Here’s a ready-to-run Python GUI that does Voice → DeepL → Voice: it records from your microphone, transcribes with Whisper (or Google Web Speech via `SpeechRecognition`), translates through the DeepL API, and speaks the result offline with `pyttsx3`. It’s designed to “just work” and gracefully fall back when optional pieces aren’t installed.
Install the dependencies:

```
pip install requests python-dotenv pyttsx3 sounddevice scipy numpy SpeechRecognition pydub
```
`tkinter` ships with most Python installs on Windows and macOS. Linux users may need `sudo apt-get install python3-tk portaudio19-dev` first.
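If you're not sure your microphone is visible to PortAudio, this quick check lists every audio device `sounddevice` can see:

```python
import sounddevice as sd

# Prints an indexed table of input/output devices known to PortAudio
print(sd.query_devices())
```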
Optional, for more accurate STT: local Whisper (models run offline).

```
pip install -U openai-whisper   # aka "whisper"
# You also need ffmpeg:
#   Windows: install https://www.gyan.dev/ffmpeg/builds/ and add to PATH
#   macOS:   brew install ffmpeg
#   Linux:   sudo apt-get install ffmpeg
```
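To confirm the Whisper install before launching the GUI, you can list the model names it knows about (this downloads nothing by itself):

```python
import whisper

# Names accepted by whisper.load_model(), e.g. 'tiny', 'base', 'small', ...
print(whisper.available_models())
```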
If Whisper isn’t available, the app falls back to Google Web Speech via `SpeechRecognition` (no API key needed, but it requires internet and has lower accuracy and informal limits).
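One note on the fallback: `recognize_google` accepts a language hint, which noticeably improves non-English transcription. A minimal sketch (the `clip.wav` filename is just an example):

```python
import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("clip.wav") as source:
    audio = r.record(source)        # read the entire file
# Hint the spoken language (BCP-47 style); omitting it defaults to en-US
print(r.recognize_google(audio, language="de-DE"))
```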
Put your DeepL API key in a `.env` file next to the script:

```
DEEPL_API_KEY=your_key_here
```
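If you want to sanity-check the key (and see remaining quota) before launching the GUI, DeepL's `/v2/usage` endpoint accepts the same auth header; a minimal sketch:

```python
import os
import requests
from dotenv import load_dotenv

load_dotenv()                        # picks up DEEPL_API_KEY from .env
key = os.getenv("DEEPL_API_KEY", "")
# Free keys end in ":fx" and use the api-free host, matching the app's logic
base = "https://api-free.deepl.com" if key.endswith(":fx") else "https://api.deepl.com"
r = requests.get(f"{base}/v2/usage",
                 headers={"Authorization": f"DeepL-Auth-Key {key}"},
                 timeout=30)
r.raise_for_status()
print(r.json())  # e.g. {"character_count": 1234, "character_limit": 500000}
```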
Save the script as `voice_deepl_voice.py`:

```python
#!/usr/bin/env python3
"""
Voice → DeepL → Voice
- Record microphone audio
- Transcribe via Whisper (if available) or Google Web Speech as fallback
- Translate via DeepL API (Free or Pro)
- Speak translated text with pyttsx3 (offline)
Requirements:
pip install requests python-dotenv pyttsx3 sounddevice scipy numpy SpeechRecognition pydub
Optional for better STT:
pip install -U openai-whisper
+ ffmpeg installed & in PATH
Run:
python voice_deepl_voice.py
"""
import os
import io
import time
import queue
import threading
from pathlib import Path
from dataclasses import dataclass
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write as wav_write
import requests
from tkinter import Tk, StringVar, BooleanVar, ttk, filedialog, messagebox
from dotenv import load_dotenv
import pyttsx3
# Optional imports (Whisper and SpeechRecognition)
HAS_WHISPER = False
try:
import whisper # openai-whisper
HAS_WHISPER = True
except Exception:
HAS_WHISPER = False
import speech_recognition as sr # Google Web Speech fallback
# ------------- Config -------------
APP_TITLE = "Voice → DeepL → Voice"
SAMPLE_RATE = 16000
CHANNELS = 1
BLOCK_SIZE = 1024
TARGET_LANG_DEFAULT = "EN"
DEEPL_PRO = "https://api.deepl.com"
DEEPL_FREE = "https://api-free.deepl.com"
SUPPORTED_TARGETS = [
"BG","CS","DA","DE","EL","EN","EN-GB","EN-US","ES","ET","FI","FR","HU","ID","IT","JA","KO",
"LT","LV","NB","NL","PL","PT","PT-BR","PT-PT","RO","RU","SK","SL","SV","TR","UK","ZH"
]
@dataclass
class RecordingState:
q: queue.Queue
buffers: list
recording: bool
stream: sd.InputStream | None
# ------------- Helpers -------------
def infer_deepl_base(api_key: str, force_free: bool) -> str:
if force_free:
return DEEPL_FREE
if api_key and api_key.endswith(":fx"):
return DEEPL_FREE
return DEEPL_PRO
def translate_deepl(text: str, api_key: str, target_lang: str, source_lang: str | None, force_free: bool=False) -> str:
base = infer_deepl_base(api_key, force_free)
url = f"{base}/v2/translate"
headers = {"Authorization": f"DeepL-Auth-Key {api_key}"}
data = {
"text": text,
"target_lang": target_lang
}
if source_lang:
data["source_lang"] = source_lang
r = requests.post(url, headers=headers, data=data, timeout=60)
r.raise_for_status()
js = r.json()
# DeepL returns {"translations":[{"detected_source_language":"DE","text":"..."}]}
return js["translations"][0]["text"]
def speak(text: str):
engine = pyttsx3.init()
engine.say(text)
engine.runAndWait()
def ensure_wav_path(tmp_dir: Path) -> Path:
tmp_dir.mkdir(parents=True, exist_ok=True)
return tmp_dir / f"recording_{int(time.time())}.wav"
def transcribe_with_whisper(wav_path: Path, model_name: str="base") -> str:
model = whisper.load_model(model_name) # "tiny", "base", "small", "medium", "large"
result = model.transcribe(str(wav_path), fp16=False)
return (result.get("text") or "").strip()
def transcribe_with_google_sr(wav_path: Path) -> str:
r = sr.Recognizer()
with sr.AudioFile(str(wav_path)) as source:
audio = r.record(source)
# You can set language hint like "de-DE" here if you like:
return r.recognize_google(audio) # raises on failure
# ------------- GUI -------------
class App:
def __init__(self, root: Tk):
self.root = root
root.title(APP_TITLE)
root.geometry("780x560")
root.minsize(720, 520)
load_dotenv()
self.api_key = StringVar(value=os.getenv("DEEPL_API_KEY", ""))
self.use_free = BooleanVar(value=False)
self.source_lang = StringVar(value="") # empty → auto
self.target_lang = StringVar(value=TARGET_LANG_DEFAULT)
self.auto_speak = BooleanVar(value=True)
self.use_whisper = BooleanVar(value=HAS_WHISPER)
self.whisper_model = StringVar(value="base")
self.status = StringVar(value="Ready.")
self.transcript = StringVar(value="")
self.translation = StringVar(value="")
self.rec_state = RecordingState(q=queue.Queue(), buffers=[], recording=False, stream=None)
self.wav_path: Path | None = None
self._build_ui()
def _build_ui(self):
pad = {"padx":8, "pady":6}
frm = ttk.Frame(self.root)
frm.pack(fill="both", expand=True)
# Row 0 - API key
ttk.Label(frm, text="DeepL API Key:").grid(row=0, column=0, sticky="e", **pad)
self.ent_api = ttk.Entry(frm, textvariable=self.api_key, width=48, show="•")
self.ent_api.grid(row=0, column=1, columnspan=2, sticky="we", **pad)
self.show_key = BooleanVar(value=False)
ttk.Checkbutton(frm, text="Show", variable=self.show_key, command=self._toggle_key).grid(row=0, column=3, sticky="w", **pad)
ttk.Checkbutton(frm, text="Use Free API (api-free.deepl.com)", variable=self.use_free).grid(row=1, column=1, sticky="w", **pad, columnspan=2)
# Row 2 - Langs
ttk.Label(frm, text="Source (optional):").grid(row=2, column=0, sticky="e", **pad)
self.ent_src = ttk.Entry(frm, textvariable=self.source_lang, width=10)
self.ent_src.grid(row=2, column=1, sticky="w", **pad)
ttk.Label(frm, text="Target:").grid(row=2, column=2, sticky="e", **pad)
self.cmb_tgt = ttk.Combobox(frm, state="readonly", values=SUPPORTED_TARGETS, textvariable=self.target_lang, width=10)
self.cmb_tgt.grid(row=2, column=3, sticky="w", **pad)
if self.target_lang.get() not in SUPPORTED_TARGETS:
self.cmb_tgt.set(TARGET_LANG_DEFAULT)
# Row 3 - STT settings
ttk.Checkbutton(frm, text=f"Use Whisper (installed: {HAS_WHISPER})", variable=self.use_whisper).grid(row=3, column=1, sticky="w", **pad)
ttk.Label(frm, text="Whisper model:").grid(row=3, column=2, sticky="e", **pad)
self.cmb_wm = ttk.Combobox(frm, state="readonly", values=["tiny","base","small","medium","large"], textvariable=self.whisper_model, width=10)
self.cmb_wm.grid(row=3, column=3, sticky="w", **pad)
# Row 4 - controls
self.btn_rec = ttk.Button(frm, text="● Record", command=self._start_record)
self.btn_rec.grid(row=4, column=1, sticky="we", **pad)
self.btn_stop = ttk.Button(frm, text="■ Stop", command=self._stop_record, state="disabled")
self.btn_stop.grid(row=4, column=2, sticky="we", **pad)
ttk.Button(frm, text="Open WAV…", command=self._open_wav).grid(row=4, column=3, sticky="we", **pad)
# Row 5 - actions
ttk.Button(frm, text="Transcribe", command=self._transcribe).grid(row=5, column=1, sticky="we", **pad)
ttk.Button(frm, text="Translate", command=self._translate).grid(row=5, column=2, sticky="we", **pad)
ttk.Checkbutton(frm, text="Auto speak after translate", variable=self.auto_speak).grid(row=5, column=3, sticky="w", **pad)
# Row 6 - transcript
ttk.Label(frm, text="Transcript:").grid(row=6, column=0, sticky="ne", **pad)
self.txt_trans = ttk.Treeview(frm, columns=("t",), show="tree", height=6)
self.txt_trans.grid(row=6, column=1, columnspan=3, sticky="nsew", **pad)
# Row 7 - translation
ttk.Label(frm, text="Translation:").grid(row=7, column=0, sticky="ne", **pad)
self.txt_transl = ttk.Treeview(frm, columns=("t",), show="tree", height=6)
self.txt_transl.grid(row=7, column=1, columnspan=3, sticky="nsew", **pad)
# Row 8 - status
ttk.Label(frm, text="Status:").grid(row=8, column=0, sticky="e", **pad)
self.lbl_status = ttk.Label(frm, textvariable=self.status)
self.lbl_status.grid(row=8, column=1, columnspan=3, sticky="w", **pad)
# stretch
frm.columnconfigure(1, weight=1)
frm.columnconfigure(2, weight=1)
frm.columnconfigure(3, weight=1)
frm.rowconfigure(6, weight=1)
frm.rowconfigure(7, weight=1)
# ---------- UI helpers ----------
def _toggle_key(self):
self.ent_api.config(show="" if self.show_key.get() else "•")
def _log(self, tree: ttk.Treeview, text: str):
tree.insert("", "end", text=text)
children = tree.get_children()
if children:
tree.see(children[-1])
def _set_status(self, s: str):
self.status.set(s)
self.root.update_idletasks()
# ---------- Recording ----------
def _audio_callback(self, indata, frames, time_info, status):
if status:
# You can log overflows/underflows here
pass
self.rec_state.q.put(indata.copy())
def _start_record(self):
if self.rec_state.recording:
return
self.rec_state.buffers.clear()
self.rec_state.q.queue.clear()
self._set_status("Recording…")
self.btn_rec.config(state="disabled")
self.btn_stop.config(state="normal")
def feeder():
while self.rec_state.recording:
try:
block = self.rec_state.q.get(timeout=0.2)
self.rec_state.buffers.append(block)
except queue.Empty:
continue
self.rec_state.recording = True
self.rec_state.stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS,
blocksize=BLOCK_SIZE, dtype="float32",
callback=self._audio_callback)
self.rec_state.stream.start()
threading.Thread(target=feeder, daemon=True).start()
def _stop_record(self):
if not self.rec_state.recording:
return
self.rec_state.recording = False
try:
self.rec_state.stream.stop(); self.rec_state.stream.close()
except Exception:
pass
self.rec_state.stream = None
self._set_status("Saving WAV…")
data = np.concatenate(self.rec_state.buffers, axis=0) if self.rec_state.buffers else np.zeros((0, CHANNELS), dtype="float32")
        data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)  # float32 → int16, clipped to avoid wraparound
wav_path = ensure_wav_path(Path("./_tmp_audio"))
wav_write(str(wav_path), SAMPLE_RATE, data)
self.wav_path = wav_path
self._set_status(f"Saved: {wav_path}")
self.btn_rec.config(state="normal")
self.btn_stop.config(state="disabled")
def _open_wav(self):
p = filedialog.askopenfilename(title="Choose WAV", filetypes=[("WAV", "*.wav")])
if p:
self.wav_path = Path(p)
self._set_status(f"Selected WAV: {self.wav_path}")
# ---------- STT / Translate ----------
def _transcribe(self):
if not self.wav_path or not self.wav_path.exists():
messagebox.showwarning("No audio", "Record audio or choose a WAV first.")
return
self._set_status("Transcribing…")
self.txt_trans.delete(*self.txt_trans.get_children())
def worker():
try:
if self.use_whisper.get() and HAS_WHISPER:
text = transcribe_with_whisper(self.wav_path, self.whisper_model.get())
else:
text = transcribe_with_google_sr(self.wav_path)
text = text.strip()
self.root.after(0, lambda: self._log(self.txt_trans, text or "(empty)"))
self.root.after(0, lambda: self._set_status("Transcription OK."))
            except Exception as e:
                msg = f"ERROR: {e}"  # bind now: `e` is cleared when the except block exits
                self.root.after(0, lambda: self._log(self.txt_trans, msg))
                self.root.after(0, lambda: self._set_status("Transcription failed."))
threading.Thread(target=worker, daemon=True).start()
def _translate(self):
# Get last transcript line
children = self.txt_trans.get_children()
if not children:
messagebox.showwarning("No transcript", "Transcribe first.")
return
last_text = self.txt_trans.item(children[-1], "text") or ""
last_text = last_text.strip()
if not last_text:
messagebox.showwarning("Empty text", "Transcript is empty.")
return
api_key = self.api_key.get().strip()
if not api_key:
messagebox.showwarning("Missing DeepL key", "Paste your DeepL API key or put it in .env")
return
tgt = self.target_lang.get().strip() or TARGET_LANG_DEFAULT
src = self.source_lang.get().strip() or None
self._set_status("Translating with DeepL…")
self.txt_transl.delete(*self.txt_transl.get_children())
def worker():
try:
out = translate_deepl(last_text, api_key, tgt, src, self.use_free.get())
out = out.strip()
self.root.after(0, lambda: self._log(self.txt_transl, out or "(empty)"))
self.root.after(0, lambda: self._set_status("Translation OK."))
if self.auto_speak.get() and out:
speak(out)
            except Exception as e:
                msg = f"ERROR: {e}"  # bind now: `e` is cleared when the except block exits
                self.root.after(0, lambda: self._log(self.txt_transl, msg))
                self.root.after(0, lambda: self._set_status("Translation failed."))
threading.Thread(target=worker, daemon=True).start()
# ------------- Entry -------------
def main():
root = Tk()
try:
style = ttk.Style()
style.theme_use("clam")
except Exception:
pass
App(root)
root.mainloop()
if __name__ == "__main__":
    main()
```
How it works:
- Recording: a `sounddevice` `InputStream` at 16 kHz mono; incoming blocks are collected in a queue and appended to buffers in a background thread, then saved as `_tmp_audio/recording_*.wav`.
- Transcription: if Whisper is available, the app loads the selected model (`tiny`/`base`/`small`/…) and calls `model.transcribe(wav)`; otherwise it falls back to `SpeechRecognition` (`recognize_google`).
- Translation: `POST /v2/translate` with your DeepL API key, target language, and optional source. Keys ending in `:fx` are routed to api-free.deepl.com; all others use api.deepl.com.
- Speech: `pyttsx3` speaks the result offline. You can call `engine.setProperty("voice", ...)` or `engine.setProperty("rate", 180)` before speaking to change the voice or pace.
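To check the pieces outside the GUI, the same pipeline fits in a short script; a minimal sketch assuming a Free API key in `DEEPL_API_KEY` and the packages above (`clip.wav` is just a scratch file):

```python
import os
import requests
import sounddevice as sd
import pyttsx3
import speech_recognition as sr
from scipy.io.wavfile import write as wav_write

RATE, SECONDS = 16000, 5
audio = sd.rec(int(RATE * SECONDS), samplerate=RATE, channels=1, dtype="int16")
sd.wait()                                       # block until recording finishes
wav_write("clip.wav", RATE, audio)

r = sr.Recognizer()
with sr.AudioFile("clip.wav") as src:
    text = r.recognize_google(r.record(src))    # Google Web Speech STT

resp = requests.post(
    "https://api-free.deepl.com/v2/translate",  # Pro keys use api.deepl.com
    headers={"Authorization": f"DeepL-Auth-Key {os.environ['DEEPL_API_KEY']}"},
    data={"text": text, "target_lang": "EN"},
    timeout=60,
)
resp.raise_for_status()
translated = resp.json()["translations"][0]["text"]

engine = pyttsx3.init()
engine.say(translated)
engine.runAndWait()
```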
Troubleshooting:
- sounddevice/PortAudio: on Windows and macOS, `pip install sounddevice` is usually enough (it bundles the binaries). On Linux, run `sudo apt-get install portaudio19-dev`, then reinstall `sounddevice`.
- Whisper: `pip install -U openai-whisper`, plus ffmpeg installed and on PATH. If Whisper is too heavy for your machine, switch to Google Web Speech by unchecking “Use Whisper”.
- DeepL errors: double-check the API key and `target_lang`; choose the target from the dropdown (e.g., `EN`, `DE`, `ES`, `FR`, `JA`, `ZH`).
- Voice quality: `pyttsx3` voices depend on your OS. On Windows you can install additional SAPI5 voices; on macOS, install extra system voices.
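To pin a specific voice, enumerate what `pyttsx3` sees on your machine; a minimal sketch (the "English" substring match is just an example, voice IDs are OS-specific):

```python
import pyttsx3

engine = pyttsx3.init()
voices = engine.getProperty("voices")
for v in voices:                      # installed voices vary by OS
    print(v.id, "-", v.name)

# Pick one by name substring and slow the speech down a bit
wanted = next((v for v in voices if "English" in v.name), None)
if wanted:
    engine.setProperty("voice", wanted.id)
engine.setProperty("rate", 180)       # words per minute; default is ~200
engine.say("Testing the selected voice.")
engine.runAndWait()
```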