Shago committed on
Commit 81e781d · verified · 1 Parent(s): ac9eb20

Upload 5 files

Add project files

Files changed (5)
  1. audio_analysis.py +51 -0
  2. llm.py +18 -0
  3. requirements.txt +13 -0
  4. tts.py +22 -0
  5. utils.py +51 -0
audio_analysis.py ADDED
@@ -0,0 +1,51 @@
+ import os
+ import shutil
+
+ import librosa
+ import matplotlib.pyplot as plt
+
+ from utils import word_by_word_table
+
+
+ def compare_audio(tts_path, user_audio_path):
+     # Accept either a file path or a (sample_rate, samples) tuple, as Gradio audio components can return.
+     if isinstance(tts_path, str):
+         y_tts, sr_tts = librosa.load(tts_path, sr=None)
+     elif isinstance(tts_path, tuple):
+         sr_tts, y_tts = tts_path
+     else:
+         raise ValueError("Invalid gTTS input type")
+     if user_audio_path is None:
+         return None
+     if isinstance(user_audio_path, str):
+         y_user, sr_user = librosa.load(user_audio_path, sr=None)
+     elif isinstance(user_audio_path, tuple):
+         sr_user, y_user = user_audio_path
+     else:
+         raise ValueError("Invalid user audio input type")
+     # Truncate both signals to the shorter one so they can be compared sample-by-sample.
+     min_len = min(len(y_tts), len(y_user))
+     y_tts, y_user = y_tts[:min_len], y_user[:min_len]
+     fig, ax = plt.subplots(3, 1, figsize=(10, 7))
+     ax[0].plot(y_tts)
+     ax[0].set_title("Reference (gTTS) Audio")
+     ax[1].plot(y_user)
+     ax[1].set_title("Your Recorded Audio")
+     ax[2].plot(y_tts - y_user, color="red")
+     ax[2].set_title("Difference (Reference - Recorded)")
+     plt.tight_layout()
+     return fig
+
+
+ def compare_both(tts_path, user_audio_path):
+     fig = compare_audio(tts_path, user_audio_path)
+     table = word_by_word_table(tts_path, user_audio_path)
+     return fig, table
+
+
+ def reset_all():
+     # Remove stale bytecode and return empty values for each UI component.
+     pycache_path = os.path.join(os.path.dirname(__file__), "__pycache__")
+     if os.path.exists(pycache_path):
+         shutil.rmtree(pycache_path)
+     return "", None, None, None, None
llm.py ADDED
@@ -0,0 +1,18 @@
+ from langchain_ollama import OllamaLLM
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema.output_parser import StrOutputParser
+
+
+ def generate_sentences(topic, n=1):
+     # Ask the local Ollama model for exactly n plain sentences, one per line.
+     prompt = ChatPromptTemplate.from_template(
+         "You are a helpful assistant. Generate exactly {n} simple sentences about the topic: {topic}. "
+         "Each sentence must be in English and appropriate for all audiences. "
+         "Return each sentence on a new line without any numbering or bullets."
+     )
+     model = OllamaLLM(model="gemma3n:e2b")  # or any model you have pulled locally
+     chain = prompt | model | StrOutputParser()
+     response = chain.invoke({"topic": topic, "n": n})
+     # Drop blank lines and cap at n in case the model over-generates.
+     sentences = [s.strip() for s in response.splitlines() if s.strip()]
+     return sentences[:n]
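A quick way to exercise generate_sentences; this sketch assumes an Ollama server is running locally and the gemma3n:e2b model has already been pulled:

from llm import generate_sentences

# The topic is arbitrary; the function returns at most n cleaned-up lines.
for sentence in generate_sentences("winter weather", n=3):
    print(sentence)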
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ --find-links https://download.pytorch.org/whl/cpu
+ torch==2.3.0+cpu
+ transformers>=4.40.0
+ accelerate
+ langchain
+ langchain-ollama
+ librosa
+ matplotlib
+ pandas
+ gTTS
+ SpeechRecognition
+ pydub
+ gradio
tts.py ADDED
@@ -0,0 +1,22 @@
+ import tempfile
+
+ from gtts import gTTS
+
+ # Language dictionary for the UI dropdown: display name -> gTTS language code.
+ LANGUAGES = {
+     "English (US)": "en",
+     "Spanish": "es",
+     "French": "fr",
+     "Portuguese": "pt",
+     "Mandarin (China Mainland)": "zh-CN",
+     "Mandarin (Taiwan)": "zh-TW",
+ }
+
+
+ def generate_tts(text, lang_code):
+     # Synthesize the text to a temporary MP3 and return its path;
+     # the caller is responsible for deleting the file.
+     tts = gTTS(text, lang=lang_code)
+     with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
+         tts.save(fp.name)
+     return fp.name
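A short usage sketch; the sample text is arbitrary, and the caller must clean up the temporary file:

import os
from tts import generate_tts, LANGUAGES

path = generate_tts("Hello, world!", LANGUAGES["English (US)"])
print(path)  # temporary .mp3 written by gTTS
os.remove(path)  # cleanup is the caller's responsibility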
utils.py ADDED
@@ -0,0 +1,51 @@
+ import os
+
+ import pandas as pd
+ import speech_recognition as sr
+ from pydub import AudioSegment
+
+
+ def mp3_to_wav(mp3_path):
+     # speech_recognition's AudioFile reads WAV/AIFF/FLAC only, so convert MP3 first.
+     sound = AudioSegment.from_mp3(mp3_path)
+     wav_path = mp3_path.replace(".mp3", ".wav")
+     sound.export(wav_path, format="wav")
+     return wav_path
+
+
+ def transcribe(audio_path):
+     recognizer = sr.Recognizer()
+     wav_path = audio_path
+     if not audio_path.lower().endswith(".wav"):
+         wav_path = mp3_to_wav(audio_path)
+     with sr.AudioFile(wav_path) as source:
+         audio = recognizer.record(source)
+     try:
+         text = recognizer.recognize_google(audio)
+     except Exception:
+         # Network errors and unintelligible audio both fall back to an empty transcript.
+         text = ""
+     if wav_path != audio_path and os.path.exists(wav_path):
+         os.remove(wav_path)
+     return text
+
+
+ def word_by_word_table(ref_audio_path, user_audio_path):
+     # Transcribe both clips and align the words by position.
+     ref_text = transcribe(ref_audio_path)
+     user_text = transcribe(user_audio_path)
+     ref_words = ref_text.strip().split()
+     user_words = user_text.strip().split()
+     max_len = max(len(ref_words), len(user_words))
+     rows = []
+     for i in range(max_len):
+         ref_word = ref_words[i] if i < len(ref_words) else ""
+         user_word = user_words[i] if i < len(user_words) else ""
+         match = ref_word.lower() == user_word.lower()
+         rows.append({
+             "Reference": ref_word,
+             "Your Attempt": user_word,
+             "Match": "✅" if match else "❌",
+         })
+     df = pd.DataFrame(rows)
+     return df
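To show how the five files could fit together, a minimal Gradio wiring sketch; no app file is part of this commit, so the layout and every component name here are hypothetical:

import gradio as gr

from audio_analysis import compare_both
from tts import generate_tts, LANGUAGES


def practice(sentence, lang_name, user_audio):
    # Synthesize the reference, then compare it against the user's recording.
    ref_path = generate_tts(sentence, LANGUAGES[lang_name])
    fig, table = compare_both(ref_path, user_audio)
    return ref_path, fig, table


with gr.Blocks() as demo:
    sentence = gr.Textbox(label="Sentence to practice")
    lang = gr.Dropdown(choices=list(LANGUAGES), value="English (US)", label="Language")
    mic = gr.Audio(sources=["microphone"], type="filepath", label="Your attempt")
    ref_audio = gr.Audio(label="Reference audio")
    plot = gr.Plot(label="Waveform comparison")
    table = gr.Dataframe(label="Word-by-word comparison")
    gr.Button("Compare").click(practice, [sentence, lang, mic], [ref_audio, plot, table])

demo.launch()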