Install Dependencies

pip install pyaudio 
pip install reverie-sdk
pip install python-dotenv
pip install pydub

How Does It Work?

The main steps involved in the script are as follows:

  1. Environment Setup:
    The script loads essential API credentials (REVERIE_APP_ID and REVERIE_API_KEY) from environment variables using the dotenv package. These credentials are required to authenticate with the Reverie API.
Python
import os
import dotenv
from reverie_sdk import ReverieClient

dotenv.load_dotenv("../.env")
REVERIE_APP_ID = os.environ.get("REVERIE_APP_ID")
REVERIE_API_KEY = os.environ.get("REVERIE_API_KEY")
client = ReverieClient(
    api_key=REVERIE_API_KEY,
    app_id=REVERIE_APP_ID,
)
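If either credential is missing, the client calls later in the script fail with authentication errors. A small guard (not part of the original script) makes that failure explicit up front:
Python
# Fail fast if the credentials were not found in the environment / ../.env file
if not REVERIE_APP_ID or not REVERIE_API_KEY:
    raise RuntimeError("Set REVERIE_APP_ID and REVERIE_API_KEY before running the script")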
  2. Real-Time Audio Capture:
    The script uses pyaudio to capture real-time audio input from the microphone. This audio is then streamed asynchronously to the Reverie ASR (Automatic Speech Recognition) service for transcription into text.
Python
import pyaudio
import asyncio
from reverie_sdk.services.asr import AudioStream

stream = AudioStream()
pa = pyaudio.PyAudio()

def mic_callback(in_data, frame_count, time_info, status):
    try:
        asyncio.run(stream.add_chunk_async(in_data))
    except Exception as e:
        print(f"Error in mic callback: {e}")
        return (None, pyaudio.paAbort)
    return (None, pyaudio.paContinue)

pa_stream = pa.open(
    rate=16000,
    channels=1,
    format=pyaudio.paInt16,
    frames_per_buffer=1024,
    input=True,
    stream_callback=mic_callback,
)
pa_stream.start_stream()
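The capture parameters above (16 kHz, mono, 16-bit samples) match the "16k_int16" format passed to the ASR call in the next step. If the default microphone is not the one you want, PyAudio can enumerate the available input devices; a small helper sketch (not part of the original script):
Python
import pyaudio

# List input-capable devices so the right microphone index can be chosen
pa = pyaudio.PyAudio()
try:
    for i in range(pa.get_device_count()):
        info = pa.get_device_info_by_index(i)
        if info.get("maxInputChannels", 0) > 0:
            print(i, info["name"])
finally:
    pa.terminate()

# Pass the chosen index to pa.open(..., input_device_index=<index>) above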
  3. Speech-to-Text (STT) Conversion:
    The captured audio is processed and transcribed using Reverie’s ASR service. The transcription happens in real time, providing immediate feedback as the speech is converted into text.
Python
await client.asr.stt_stream_async(
    src_lang=source_lang,
    bytes_or_stream=stream,
    callback=asr_callback,
    format="16k_int16",
    punctuate="true",
)
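The asr_callback passed here is supplied by the caller. In the full sample below it simply collects each streaming ReverieAsrResult, and a loop then waits for the first result flagged as final:
Python
asr_responses: list[ReverieAsrResult] = []

def asr_callback(resp: ReverieAsrResult):
    # Collect every streaming result; `final` marks the end of the utterance
    print(resp)
    asr_responses.append(resp)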
  4. Translation:
    Once the transcription is complete, the text is sent to Reverie’s NMT (Neural Machine Translation) service to be translated from the source language (e.g., Hindi) to a target language (e.g., English). The translated text is returned for further processing.
Python
src_lang = "hi"
tgt_lang = "en"

resp: str = asyncio.run(speech_to_text(src_lang)) or ""  # fall back to "" if transcription failed

if len(resp.strip()) > 0:
    tgt_resp = (
        client.nmt.localization(
            [resp],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            enableTransliteration=False,
            enableNmt=True,
        )
        .responseList[0]
        .outString
    )
    print(tgt_resp)
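Because localization accepts a list of input strings, several transcripts can in principle be translated in one call; a sketch, assuming responseList preserves the order of the inputs:
Python
# Hypothetical batch translation of multiple transcripts in one request
sentences = ["पहला वाक्य", "दूसरा वाक्य"]
results = client.nmt.localization(
    sentences,
    src_lang="hi",
    tgt_lang="en",
    enableTransliteration=False,
    enableNmt=True,
).responseList

for original, item in zip(sentences, results):
    print(original, "->", item.outString)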
  5. Text-to-Speech (TTS) Conversion:
    The translated text is then sent to Reverie’s TTS (Text-to-Speech) service to convert it into speech. The generated audio is played back to the user, providing a seamless speech-to-speech translation experience.
Python
tts_resp = client.tts.tts(
    f"{tgt_lang}_female",
    tgt_resp,
)
print(tts_resp)
  6. Optional Audio Saving:
    The generated speech audio can optionally be saved to a file (e.g., .wav) for later use or playback.
Python
tts_resp.save_audio("./sample.wav", overwrite_existing=True)
  7. Audio Playback:
    Using the pydub library, the resulting audio is played back to the user in real time, completing the speech-to-text, translation, and text-to-speech cycle.
Python
from pydub import AudioSegment, playback
import io

playback.play(AudioSegment.from_file(io.BytesIO(tts_resp.audio_bytes), format="wav"))
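If the audio was saved in the previous step, playback can also be driven from the file on disk rather than the in-memory bytes:
Python
# Play back the previously saved file instead of the raw response bytes
playback.play(AudioSegment.from_wav("./sample.wav"))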
  8. Error Handling:
    The script includes error handling to manage potential issues during the microphone capture, transcription, translation, and speech synthesis processes.
Python
import traceback

try:
    ...  # audio streaming, transcription, translation, and TTS code
except Exception as err:
    print(f"Error in Speech-to-Text: {err}")
    traceback.print_exc()
finally:
    pa_stream.stop_stream()
    pa_stream.close()
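The finally block guarantees the microphone stream is stopped and closed even if something fails mid-pipeline. A small addition worth considering (not part of the original script) is releasing PortAudio itself once the stream is closed:
Python
try:
    ...  # audio streaming, transcription, translation, and TTS code
finally:
    pa_stream.stop_stream()
    pa_stream.close()
    pa.terminate()  # also free the underlying PortAudio resources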

Sample Code

import asyncio
import io
import os
import traceback
import pyaudio
from pydub import AudioSegment, playback
from reverie_sdk import ReverieClient
from reverie_sdk.services.asr import ReverieAsrResult, AudioStream
import dotenv

dotenv.load_dotenv("../.env")

REVERIE_APP_ID = os.environ.get("REVERIE_APP_ID")
REVERIE_API_KEY = os.environ.get("REVERIE_API_KEY")

client = ReverieClient(
    api_key=REVERIE_API_KEY,
    app_id=REVERIE_APP_ID,
)


async def speech_to_text(source_lang):
    stream = AudioStream()
    pa = pyaudio.PyAudio()

    def mic_callback(in_data, frame_count, time_info, status):
        try:
            asyncio.run(stream.add_chunk_async(in_data))
        except Exception as e:
            print(f"Error in mic callback: {e}")
            return (None, pyaudio.paAbort)
        return (None, pyaudio.paContinue)

    pa_stream = pa.open(
        rate=16000,
        channels=1,
        format=pyaudio.paInt16,
        frames_per_buffer=1024,
        input=True,
        stream_callback=mic_callback,
    )
    pa_stream.start_stream()

    print("Listening at STT..")

    asr_responses: list[ReverieAsrResult] = []

    def asr_callback(resp: ReverieAsrResult):
        print(resp)
        asr_responses.append(resp)

    try:
        await client.asr.stt_stream_async(
            src_lang=source_lang,
            bytes_or_stream=stream,
            callback=asr_callback,
            format="16k_int16",
            punctuate="true",
        )

        while True:
            if len(asr_responses) == 0:
                continue

            if asr_responses[-1].final:
                print(asr_responses[-1].display_text)
                return asr_responses[-1].display_text

    except Exception as err:
        print(f"Error in Speech-to-Text: {err}")
        traceback.print_exc()

    finally:
        pa_stream.stop_stream()
        pa_stream.close()


src_lang = "hi"
tgt_lang = "en"


resp: str = asyncio.run(speech_to_text(src_lang)) or ""  # fall back to "" if transcription failed

if len(resp.strip()) > 0:
    tgt_resp = (
        client.nmt.localization(
            [resp],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            enableTransliteration=False,
            enableNmt=True,
        )
        .responseList[0]
        .outString
    )
    print(tgt_resp)

    tts_resp = client.tts.tts(
        f"{tgt_lang}_female",
        tgt_resp,
    )
    print(tts_resp)
    
    # optional: save the audio
    # tts_resp.save_audio("./sample.wav", overwrite_existing=True)

    playback.play(AudioSegment.from_file(io.BytesIO(tts_resp.audio_bytes), format="wav"))
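
To run this as an ongoing interpreter rather than a one-shot script, the transcribe, translate, and speak sequence can be wrapped in a loop. A minimal sketch, assuming the client, speech_to_text, src_lang, and tgt_lang from the sample above are already defined:
Python
# Hypothetical wrapper: keep interpreting until the script is interrupted (Ctrl+C)
def run_interpreter():
    while True:
        text = asyncio.run(speech_to_text(src_lang)) or ""
        if not text.strip():
            continue

        translated = (
            client.nmt.localization(
                [text],
                src_lang=src_lang,
                tgt_lang=tgt_lang,
                enableTransliteration=False,
                enableNmt=True,
            )
            .responseList[0]
            .outString
        )

        audio = client.tts.tts(f"{tgt_lang}_female", translated)
        playback.play(AudioSegment.from_file(io.BytesIO(audio.audio_bytes), format="wav"))

# run_interpreter()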