Spaces:
Running
Running
Upload 5 files
Browse filesAdd project files
- audio_analysis.py +48 -0
- llm.py +15 -0
- requirements.txt +12 -0
- tts.py +19 -0
- utils.py +44 -0
audio_analysis.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from utils import word_by_word_table
|
| 5 |
+
import shutil
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def compare_audio(tts_path, user_audio_path):
    """Plot the reference (TTS) waveform, the user's waveform, and their difference.

    Parameters
    ----------
    tts_path : str | tuple
        Path to the reference audio file, or a ``(sample_rate, samples)``
        tuple (the shape Gradio audio components produce).
    user_audio_path : str | tuple | None
        The user's recording in the same two accepted forms. ``None`` means
        nothing was recorded yet.

    Returns
    -------
    matplotlib.figure.Figure | None
        A 3-row figure (reference, recording, difference), or ``None`` when
        there is no user audio.

    Raises
    ------
    ValueError
        If either input is neither a path string nor a tuple.
    """
    if isinstance(tts_path, str):
        y_tts, sr_tts = librosa.load(tts_path, sr=None)
    elif isinstance(tts_path, tuple):
        sr_tts, y_tts = tts_path
    else:
        raise ValueError("Invalid gTTS input type")
    if user_audio_path is None:
        return None
    if isinstance(user_audio_path, str):
        y_user, sr_user = librosa.load(user_audio_path, sr=None)
    elif isinstance(user_audio_path, tuple):
        sr_user, y_user = user_audio_path
    else:
        raise ValueError("Invalid user audio input type")
    # Work in float: tuple inputs (e.g. from a Gradio microphone component)
    # are typically int16, and subtracting int16 arrays silently overflows.
    y_tts = y_tts.astype(float)
    y_user = y_user.astype(float)
    # Bring the user audio onto the reference sample rate so the two signals
    # are compared sample-for-sample on the same time axis; without this the
    # element-wise difference below is meaningless when the rates differ.
    if sr_user != sr_tts:
        y_user = librosa.resample(y_user, orig_sr=sr_user, target_sr=sr_tts)
    # Truncate both to the common length so the arrays align element-wise.
    min_len = min(len(y_tts), len(y_user))
    y_tts, y_user = y_tts[:min_len], y_user[:min_len]
    fig, ax = plt.subplots(3, 1, figsize=(10, 7))
    ax[0].plot(y_tts)
    ax[0].set_title("Reference (gTTS) Audio")
    ax[1].plot(y_user)
    ax[1].set_title("Your Recorded Audio")
    ax[2].plot(y_tts - y_user, color="red")
    ax[2].set_title("Difference (Reference - Recorded)")
    plt.tight_layout()
    return fig
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def compare_both(tts_path, user_audio_path):
    """Run both comparisons at once: waveform figure plus word-match table.

    Returns a ``(figure, dataframe)`` pair suitable for feeding two Gradio
    output components in one callback.
    """
    return (
        compare_audio(tts_path, user_audio_path),
        word_by_word_table(tts_path, user_audio_path),
    )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def reset_all():
    """Clear this module's stale bytecode cache and blank out the UI state.

    Removes the ``__pycache__`` directory next to this file if present, then
    returns one empty string and four ``None`` values — the cleared values
    for the app's output widgets.
    """
    module_dir = os.path.dirname(__file__)
    cache_dir = os.path.join(module_dir, "__pycache__")
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    return "", None, None, None, None
|
llm.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_ollama import OllamaLLM
|
| 2 |
+
from langchain.prompts import ChatPromptTemplate
|
| 3 |
+
from langchain.schema.output_parser import StrOutputParser
|
| 4 |
+
|
| 5 |
+
def generate_sentences(topic, n=1, *, model_name="gemma3n:e2b"):
    """Generate up to *n* simple English sentences about *topic* via a local Ollama model.

    Parameters
    ----------
    topic : str
        Subject the sentences should be about.
    n : int
        Number of sentences requested (and the hard cap on the result).
    model_name : str
        Ollama model identifier; keyword-only so existing callers are
        unaffected. Defaults to the original hard-coded "gemma3n:e2b".

    Returns
    -------
    list[str]
        At most *n* non-empty, stripped lines from the model's response.
        May be shorter if the model returns fewer lines.
    """
    prompt = ChatPromptTemplate.from_template(
        "You are a helpful assistant. Generate exactly {n} simple sentences about the topic: {topic}. "
        "Each sentence must be in English and appropriate for all audiences. "
        "Return each sentence on a new line without any numbering or bullets"
    )
    model = OllamaLLM(model=model_name)
    chain = prompt | model | StrOutputParser()
    response = chain.invoke({"topic": topic, "n": n})
    # One sentence per line was requested; drop blanks and surrounding
    # whitespace, then clamp in case the model over-produces.
    sentences = [line.strip() for line in response.splitlines() if line.strip()]
    return sentences[:n]
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--find-links https://download.pytorch.org/whl/cpu
|
| 2 |
+
torch==2.3.0+cpu
|
| 3 |
+
transformers>=4.40.0
|
| 4 |
+
accelerate
|
| 5 |
+
langchain
|
| 6 |
+
librosa
|
| 7 |
+
matplotlib
|
| 8 |
+
pandas
|
| 9 |
+
gTTS
|
| 10 |
+
SpeechRecognition
|
| 11 |
+
pydub
|
| 12 |
+
gradio
|
tts.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from gtts import gTTS
|
| 2 |
+
import tempfile
|
| 3 |
+
|
| 4 |
+
# Language dictionary for dropdown:
# maps the human-readable label shown in the UI to the gTTS language code
# that generate_tts() passes through to gTTS.
LANGUAGES = {
    "English (US)": "en",
    "Spanish": "es",
    "French": "fr",
    "Portuguese": "pt",
    "Mandarin (China Mainland)": "zh-CN",
    "Mandarin (Taiwan)": "zh-TW",
}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def generate_tts(text, lang_code):
    """Synthesize *text* with gTTS and return the path of a temporary .mp3 file.

    Parameters
    ----------
    text : str
        Text to speak.
    lang_code : str
        gTTS language code (see LANGUAGES).

    Returns
    -------
    str
        Path to the generated MP3. The file is created with ``delete=False``,
        so the caller is responsible for removing it.
    """
    tts = gTTS(text, lang=lang_code)
    # Close the temp file handle BEFORE gTTS writes to the path: on Windows a
    # NamedTemporaryFile that is still open cannot be reopened by name, so
    # saving inside the `with` block fails there.
    fp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    fp.close()
    tts.save(fp.name)
    return fp.name
|
utils.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import speech_recognition as sr
|
| 2 |
+
from pydub import AudioSegment
|
| 3 |
+
import os
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
def mp3_to_wav(mp3_path):
    """Convert an MP3 file to a WAV file alongside it and return the new path.

    Parameters
    ----------
    mp3_path : str
        Path to an existing MP3 file.

    Returns
    -------
    str
        Path of the exported WAV file (same directory and basename).
    """
    sound = AudioSegment.from_mp3(mp3_path)
    # os.path.splitext swaps only the final extension; the previous
    # str.replace(".mp3", ".wav") corrupted paths containing ".mp3"
    # elsewhere (e.g. "takes.mp3.bak/clip.mp3") and missed ".MP3".
    wav_path = os.path.splitext(mp3_path)[0] + ".wav"
    sound.export(wav_path, format="wav")
    return wav_path
|
| 11 |
+
|
| 12 |
+
def transcribe(audio_path):
    """Transcribe an audio file to text using Google's free speech recognizer.

    Non-WAV input is first converted to a temporary WAV, which is always
    deleted afterwards.

    Parameters
    ----------
    audio_path : str
        Path to a .wav or .mp3 file.

    Returns
    -------
    str
        The recognized text, or "" when the speech is unintelligible or the
        recognition service is unreachable.
    """
    recognizer = sr.Recognizer()
    wav_path = audio_path
    if not audio_path.lower().endswith(".wav"):
        # NOTE(review): assumes every non-WAV input is MP3 — mp3_to_wav will
        # fail on other formats; confirm callers only pass .wav/.mp3.
        wav_path = mp3_to_wav(audio_path)
    try:
        with sr.AudioFile(wav_path) as source:
            audio = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio)
        except (sr.UnknownValueError, sr.RequestError):
            # Unintelligible audio or network/API failure: degrade to "" as
            # before. Other exceptions (real bugs) now propagate instead of
            # being silently swallowed by a bare `except Exception`.
            text = ""
    finally:
        # Remove the temporary WAV even when recognition raised; previously
        # an error during record() leaked the converted file.
        if wav_path != audio_path and os.path.exists(wav_path):
            os.remove(wav_path)
    return text
|
| 26 |
+
|
| 27 |
+
def word_by_word_table(ref_audio_path, user_audio_path):
    """Build a word-level comparison table between two audio transcriptions.

    Both files are transcribed; the i-th reference word is paired with the
    i-th user word (the shorter side is padded with empty strings) and each
    pair is marked with a case-insensitive match indicator.

    Returns
    -------
    pandas.DataFrame
        Columns: "Reference", "Your Attempt", "Match" (one row per word).
    """
    reference = transcribe(ref_audio_path).strip().split()
    attempt = transcribe(user_audio_path).strip().split()
    # Pad the shorter word list so the two align row-for-row.
    width = max(len(reference), len(attempt))
    reference = reference + [""] * (width - len(reference))
    attempt = attempt + [""] * (width - len(attempt))
    rows = [
        {
            "Reference": ref_word,
            "Your Attempt": user_word,
            "Match": "✅" if ref_word.lower() == user_word.lower() else "❌",
        }
        for ref_word, user_word in zip(reference, attempt)
    ]
    return pd.DataFrame(rows)
|