Shago committed on
Commit 81e781d · verified · 1 Parent(s): ac9eb20

Upload 5 files

Add project files

Files changed (5)
  1. audio_analysis.py +51 -0
  2. llm.py +18 -0
  3. requirements.txt +13 -0
  4. tts.py +22 -0
  5. utils.py +51 -0
audio_analysis.py ADDED
@@ -0,0 +1,51 @@
+ import os
+ import shutil
+
+ import librosa
+ import matplotlib.pyplot as plt
+
+ from utils import word_by_word_table
+
+
+ def compare_audio(tts_path, user_audio_path):
+     # Accept either a file path or a (sample_rate, samples) tuple, as Gradio audio components can return.
+     if isinstance(tts_path, str):
+         y_tts, sr_tts = librosa.load(tts_path, sr=None)
+     elif isinstance(tts_path, tuple):
+         sr_tts, y_tts = tts_path
+     else:
+         raise ValueError("Invalid gTTS input type")
+     if user_audio_path is None:
+         return None
+     if isinstance(user_audio_path, str):
+         y_user, sr_user = librosa.load(user_audio_path, sr=None)
+     elif isinstance(user_audio_path, tuple):
+         sr_user, y_user = user_audio_path
+     else:
+         raise ValueError("Invalid user audio input type")
+     # Truncate both signals to the shorter one so they can be compared sample-by-sample.
+     min_len = min(len(y_tts), len(y_user))
+     y_tts, y_user = y_tts[:min_len], y_user[:min_len]
+     fig, ax = plt.subplots(3, 1, figsize=(10, 7))
+     ax[0].plot(y_tts)
+     ax[0].set_title("Reference (gTTS) Audio")
+     ax[1].plot(y_user)
+     ax[1].set_title("Your Recorded Audio")
+     ax[2].plot(y_tts - y_user, color="red")
+     ax[2].set_title("Difference (Reference - Recorded)")
+     plt.tight_layout()
+     return fig
+
+
+ def compare_both(tts_path, user_audio_path):
+     fig = compare_audio(tts_path, user_audio_path)
+     table = word_by_word_table(tts_path, user_audio_path)
+     return fig, table
+
+
+ def reset_all():
+     # Remove stale bytecode and return empty values for each UI component.
+     pycache_path = os.path.join(os.path.dirname(__file__), "__pycache__")
+     if os.path.exists(pycache_path):
+         shutil.rmtree(pycache_path)
+     return "", None, None, None, None
llm.py ADDED
@@ -0,0 +1,18 @@
+ from langchain_ollama import OllamaLLM
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema.output_parser import StrOutputParser
+
+
+ def generate_sentences(topic, n=1):
+     # Ask the local Ollama model for exactly n plain sentences, one per line.
+     prompt = ChatPromptTemplate.from_template(
+         "You are a helpful assistant. Generate exactly {n} simple sentences about the topic: {topic}. "
+         "Each sentence must be in English and appropriate for all audiences. "
+         "Return each sentence on a new line without any numbering or bullets."
+     )
+     model = OllamaLLM(model="gemma3n:e2b")  # or any model you have pulled locally
+     chain = prompt | model | StrOutputParser()
+     response = chain.invoke({"topic": topic, "n": n})
+     # Drop blank lines and cap at n in case the model over-generates.
+     sentences = [s.strip() for s in response.splitlines() if s.strip()]
+     return sentences[:n]
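A quick way to exercise generate_sentences; this sketch assumes an Ollama server is running locally and the gemma3n:e2b model has already been pulled:

from llm import generate_sentences

# The topic is arbitrary; the function returns at most n cleaned-up lines.
for sentence in generate_sentences("winter weather", n=3):
    print(sentence)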
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ --find-links https://download.pytorch.org/whl/cpu
+ torch==2.3.0+cpu
+ transformers>=4.40.0
+ accelerate
+ langchain
+ langchain-ollama
+ librosa
+ matplotlib
+ pandas
+ gTTS
+ SpeechRecognition
+ pydub
+ gradio
tts.py ADDED
@@ -0,0 +1,22 @@
+ import tempfile
+
+ from gtts import gTTS
+
+ # Language dictionary for the UI dropdown: display name -> gTTS language code.
+ LANGUAGES = {
+     "English (US)": "en",
+     "Spanish": "es",
+     "French": "fr",
+     "Portuguese": "pt",
+     "Mandarin (China Mainland)": "zh-CN",
+     "Mandarin (Taiwan)": "zh-TW",
+ }
+
+
+ def generate_tts(text, lang_code):
+     # Synthesize the text to a temporary MP3 and return its path;
+     # the caller is responsible for deleting the file.
+     tts = gTTS(text, lang=lang_code)
+     with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
+         tts.save(fp.name)
+     return fp.name
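A short usage sketch; the sample text is arbitrary, and the caller must clean up the temporary file:

import os
from tts import generate_tts, LANGUAGES

path = generate_tts("Hello, world!", LANGUAGES["English (US)"])
print(path)  # temporary .mp3 written by gTTS
os.remove(path)  # cleanup is the caller's responsibility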
utils.py ADDED
@@ -0,0 +1,51 @@
+ import os
+
+ import pandas as pd
+ import speech_recognition as sr
+ from pydub import AudioSegment
+
+
+ def mp3_to_wav(mp3_path):
+     # speech_recognition's AudioFile reads WAV/AIFF/FLAC only, so convert MP3 first.
+     sound = AudioSegment.from_mp3(mp3_path)
+     wav_path = mp3_path.replace(".mp3", ".wav")
+     sound.export(wav_path, format="wav")
+     return wav_path
+
+
+ def transcribe(audio_path):
+     recognizer = sr.Recognizer()
+     wav_path = audio_path
+     if not audio_path.lower().endswith(".wav"):
+         wav_path = mp3_to_wav(audio_path)
+     with sr.AudioFile(wav_path) as source:
+         audio = recognizer.record(source)
+     try:
+         text = recognizer.recognize_google(audio)
+     except Exception:
+         # Network errors and unintelligible audio both fall back to an empty transcript.
+         text = ""
+     if wav_path != audio_path and os.path.exists(wav_path):
+         os.remove(wav_path)
+     return text
+
+
+ def word_by_word_table(ref_audio_path, user_audio_path):
+     # Transcribe both clips and align the words by position.
+     ref_text = transcribe(ref_audio_path)
+     user_text = transcribe(user_audio_path)
+     ref_words = ref_text.strip().split()
+     user_words = user_text.strip().split()
+     max_len = max(len(ref_words), len(user_words))
+     rows = []
+     for i in range(max_len):
+         ref_word = ref_words[i] if i < len(ref_words) else ""
+         user_word = user_words[i] if i < len(user_words) else ""
+         match = ref_word.lower() == user_word.lower()
+         rows.append({
+             "Reference": ref_word,
+             "Your Attempt": user_word,
+             "Match": "✅" if match else "❌",
+         })
+     df = pd.DataFrame(rows)
+     return df
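To show how the five files could fit together, a minimal Gradio wiring sketch; no app file is part of this commit, so the layout and every component name here are hypothetical:

import gradio as gr

from audio_analysis import compare_both
from tts import generate_tts, LANGUAGES


def practice(sentence, lang_name, user_audio):
    # Synthesize the reference, then compare it against the user's recording.
    ref_path = generate_tts(sentence, LANGUAGES[lang_name])
    fig, table = compare_both(ref_path, user_audio)
    return ref_path, fig, table


with gr.Blocks() as demo:
    sentence = gr.Textbox(label="Sentence to practice")
    lang = gr.Dropdown(choices=list(LANGUAGES), value="English (US)", label="Language")
    mic = gr.Audio(sources=["microphone"], type="filepath", label="Your attempt")
    ref_audio = gr.Audio(label="Reference audio")
    plot = gr.Plot(label="Waveform comparison")
    table = gr.Dataframe(label="Word-by-word comparison")
    gr.Button("Compare").click(practice, [sentence, lang, mic], [ref_audio, plot, table])

demo.launch()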