"""Gradio app for multilingual student-feedback sentiment analysis.

For each submitted text the app:

1. embeds it with a SentenceTransformer model,
2. retrieves the nearest stored feedback sentences from a FAISS index,
3. predicts a sentiment label with a pre-trained classifier,
4. guesses the language (Urdu / Roman Urdu / English / mixed), and
5. appends the result to ``user_feedback.csv`` and refreshes the history table.

All model and data files are loaded once at import time.
"""
import os
import re
import traceback
from datetime import datetime

import faiss
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Try to import langdetect, use fallback if not available
try:
    from langdetect import detect, LangDetectException  # noqa: F401
    LANGDETECT_AVAILABLE = True
except ImportError:
    print("⚠️ langdetect not available, using fallback language detection")
    LANGDETECT_AVAILABLE = False

# ===============================
# Load assets
# ===============================
print("🔄 Loading data and models...")

df = pd.read_csv("clean_feedback.csv", encoding='utf-8')
print("✅ CSV loaded with columns:", df.columns.tolist())

embeddings = np.load("embeddings.npy")
print("✅ Embeddings loaded with shape:", embeddings.shape)

index = faiss.read_index("feedback.index")
print("✅ FAISS index loaded")

# NOTE: joblib.load unpickles the file; only ever point this at a trusted,
# locally produced model artifact.
clf = joblib.load("feedback_model.pkl")
print("✅ Sentiment model loaded")

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
print("✅ SentenceTransformer ready")

# File to store user submissions
USER_FEEDBACK_FILE = "user_feedback.csv"

# Single source of truth for the submission-history schema.
HISTORY_COLUMNS = [
    "Timestamp", "Sentence", "Predicted_Sentiment", "Confidence", "Language"
]

# Initialize CSV with proper columns if it doesn't exist
if not os.path.exists(USER_FEEDBACK_FILE):
    pd.DataFrame(columns=HISTORY_COLUMNS).to_csv(
        USER_FEEDBACK_FILE, index=False, encoding='utf-8-sig'
    )


# ===============================
# History helpers
# ===============================
def _empty_history():
    """Return an empty history frame with the expected columns."""
    return pd.DataFrame(columns=HISTORY_COLUMNS)


def _read_history():
    """Read the saved submissions as stored on disk (unsorted).

    Returns an empty, correctly-shaped frame when the file is missing or
    cannot be parsed, so callers never have to handle I/O errors themselves.
    """
    try:
        return pd.read_csv(USER_FEEDBACK_FILE, encoding='utf-8-sig')
    except (OSError, ValueError) as exc:
        # FileNotFoundError/PermissionError are OSError; pandas' EmptyDataError
        # and ParserError, and UnicodeDecodeError, are ValueError subclasses.
        print(f"⚠️ Could not read {USER_FEEDBACK_FILE}: {exc}")
        return _empty_history()


# ===============================
# Language Detection Function
# ===============================
# Common transliterated (Roman Urdu) words. A frozenset gives O(1) membership
# tests and guarantees that no word is counted twice.
_ROMAN_URDU_WORDS = frozenset({
    'hai', 'nahi', 'acha', 'achchha', 'bohot', 'bahut', 'bhi', 'kya',
    'kaise', 'theek', 'thik', 'matlab', 'samajh', 'bilkul', 'yar', 'yaar',
    'par', 'lekin', 'aur', 'ka', 'ki', 'ko', 'se', 'me', 'mein', 'hain',
    'tha', 'thi', 'gaya', 'gayi', 'karna', 'karo', 'kuch', 'sab',
    'zyada', 'kam', 'achha', 'bura',
})

# Extracts Latin-letter words so attached punctuation ("acha!") is ignored.
_LATIN_WORD_RE = re.compile(r"[a-z]+")


def detect_language(text):
    """Guess the language of *text* with lightweight heuristics.

    Returns one of: ``"Urdu"``, ``"Mixed (Urdu+English)"``, ``"Roman Urdu"``,
    ``"Mixed (Roman Urdu+English)"``, ``"English"`` or ``"Unknown"`` (no
    letters in the input, or an unexpected error occurred).
    """
    try:
        # Only letters take part in the ratio; digits and punctuation from the
        # Arabic block must not make a Latin-script text look partly Urdu.
        letters = [ch for ch in text if ch.isalpha()]
        if not letters:
            return "Unknown"

        urdu_letters = sum(1 for ch in letters if '\u0600' <= ch <= '\u06FF')
        urdu_ratio = urdu_letters / len(letters)

        # If more than 50% Urdu characters
        if urdu_ratio > 0.5:
            return "Urdu"
        # If some Urdu characters mixed with English
        if urdu_ratio > 0:
            return "Mixed (Urdu+English)"

        # For English/Roman Urdu detection
        if LANGDETECT_AVAILABLE:
            try:
                if detect(text) == 'ur':
                    return "Urdu"
            except Exception:
                # Best effort: langdetect fails on very short or featureless
                # input, so fall through to the word-list heuristic.
                pass

        # Count distinct Roman Urdu indicator words present in the text.
        tokens = set(_LATIN_WORD_RE.findall(text.lower()))
        matches = len(tokens & _ROMAN_URDU_WORDS)

        if matches >= 2:  # 2 or more Roman Urdu words found
            return "Roman Urdu"
        if matches >= 1:
            return "Mixed (Roman Urdu+English)"
        return "English"

    except Exception as e:
        print(f"⚠️ Language detection error: {e}")
        return "Unknown"


# ===============================
# Load initial data function
# ===============================
def load_initial_data():
    """Return the saved submission history, newest first.

    Never raises: falls back to an empty frame with the history columns if
    the file is missing, empty, or cannot be sorted.
    """
    try:
        history = _read_history()
        if len(history) > 0:
            return history.sort_values('Timestamp', ascending=False)
    except Exception as exc:
        # UI boundary: e.g. a hand-edited CSV without a Timestamp column.
        print(f"⚠️ Could not load submission history: {exc}")
    return _empty_history()


# ===============================
# Core classification function
# ===============================
def _failure_outputs(message, table):
    """Build the 6-tuple of UI updates that shows *message* and hides results."""
    return (
        gr.update(value=message, visible=True),   # error box
        gr.update(value="", visible=False),       # similar sentences
        table,                                    # history table
        gr.update(visible=False),                 # sentiment badge
        gr.update(visible=False),                 # confidence badge
        gr.update(visible=False),                 # language badge
    )


def _classify_with_examples(text, top_k, progress):
    """Run the full analysis pipeline for one feedback text.

    Returns ``(outputs, examples_markdown, succeeded)`` where ``outputs`` is
    the 6-tuple documented on :func:`classify_feedback`, ``examples_markdown``
    lists the retrieved neighbours with their stored sentiment and a
    similarity figure, and ``succeeded`` tells the caller whether results
    should be shown. Never raises; errors are reported through ``outputs``.
    """
    try:
        if not text or not text.strip():
            return (
                _failure_outputs("⚠️ Please enter a feedback text.", load_initial_data()),
                "",
                False,
            )

        top_k = max(1, int(top_k))

        progress(0.1, desc="🔍 Analyzing text...")

        # Embed query (FAISS expects a contiguous float32 matrix)
        query_emb = np.ascontiguousarray(model.encode([text]), dtype=np.float32)

        progress(0.3, desc="🔍 Finding similar feedbacks...")

        # Retrieve similar sentences. FAISS pads missing results with id -1
        # when the index holds fewer than top_k vectors; drop those so that
        # df.iloc[-1] does not silently return the last row.
        distances, indices = index.search(query_emb, top_k)
        found = indices[0] >= 0
        hit_distances = distances[0][found]
        retrieved = df.iloc[indices[0][found]]

        progress(0.6, desc="🤖 Classifying sentiment...")

        # Predict sentiment & probability
        probs_all = clf.predict_proba(query_emb)[0]
        sentiment = clf.classes_[np.argmax(probs_all)]
        confidence = float(np.max(probs_all))

        # Detect language
        language = detect_language(text)

        progress(0.8, desc="💾 Saving results...")

        # Extract only sentences for copying
        similar_sentences = retrieved['Sentence'].tolist()
        sentences_text = "\n".join(
            f"{i + 1}. {s}" for i, s in enumerate(similar_sentences)
        )

        # Format similar examples with similarity scores and sentiment.
        # NOTE(review): "1 - distance" is only a meaningful similarity if the
        # index stores a distance bounded by 1 — confirm the index metric.
        examples = "\n\n".join(
            f"**{i + 1}.** [{retrieved.iloc[i].get('Sentiment', 'N/A')}] {s}  \n"
            f"*Similarity: {(1 - hit_distances[i]):.1%}*"
            for i, s in enumerate(similar_sentences)
        )

        # Save user submission with timestamp and language
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        new_row = pd.DataFrame([{
            "Timestamp": timestamp,
            "Sentence": text,
            "Predicted_Sentiment": sentiment,
            "Confidence": round(confidence, 4),
            "Language": language
        }])

        # Read existing data and append with proper encoding. Skipping the
        # concat for an empty history avoids pandas' empty-frame dtype warning.
        existing = _read_history()
        if existing.empty:
            updated = new_row
        else:
            updated = pd.concat([existing, new_row], ignore_index=True)
        updated.to_csv(USER_FEEDBACK_FILE, index=False, encoding='utf-8-sig')

        progress(1.0, desc="✅ Complete!")

        print(f"✅ Prediction: {sentiment} ({confidence:.2f}) | Language: {language}")

        # Determine sentiment color
        sentiment_color = {
            "Positive": "🟢",
            "Negative": "🔴",
            "Neutral": "🟡"
        }.get(sentiment, "⚪")

        outputs = (
            gr.update(visible=False),  # error box
            gr.update(value=sentences_text, visible=True),  # similar sentences
            updated.sort_values('Timestamp', ascending=False),  # table
            gr.update(visible=True, value=f"{sentiment_color} **{sentiment}**"),  # sentiment badge
            gr.update(visible=True, value=f"**{confidence:.1%}**"),  # confidence badge
            gr.update(visible=True, value=f"**{language}**")  # language badge
        )
        return outputs, examples, True

    except Exception:
        # UI boundary: surface the failure in the error box instead of crashing.
        tb = traceback.format_exc()
        print("❌ Error:", tb)
        return (
            _failure_outputs(
                f"❌ **Error occurred:**\n```\n{tb}\n```", load_initial_data()
            ),
            "",
            False,
        )


def classify_feedback(text, top_k=5, progress=gr.Progress()):
    """Classify *text*, store the submission, and return UI updates.

    Args:
        text: Feedback text to analyse.
        top_k: Number of similar sentences to retrieve (values below 1 are
            treated as 1).
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            what lets Gradio inject a live tracker for event handlers.

    Returns:
        A 6-tuple ``(error_box, similar_sentences, history_table,
        sentiment_badge, confidence_badge, language_badge)``. On empty input
        or on any error, the error box is shown and the result widgets are
        hidden. This function does not raise.
    """
    outputs, _examples, _succeeded = _classify_with_examples(text, top_k, progress)
    return outputs


# ===============================
# Clear history function
# ===============================
def clear_history():
    """Clear all user feedback history.

    Returns ``(status_message, history_table, csv_path)``. If the file cannot
    be rewritten, the message describes the error and the current history is
    returned instead.
    """
    try:
        cleared = _empty_history()
        cleared.to_csv(USER_FEEDBACK_FILE, index=False, encoding='utf-8-sig')
        return "✅ History cleared successfully!", cleared, USER_FEEDBACK_FILE
    except Exception as e:
        # load_initial_data never raises, so this handler cannot fail in turn.
        return f"❌ Error clearing history: {str(e)}", load_initial_data(), USER_FEEDBACK_FILE


# ===============================
# Gradio Interface
# ===============================
custom_css = """
.rtl-text textarea {
    direction: rtl;
    text-align: right;
    font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq', 'Arial', sans-serif;
    font-size: 16px;
}
.input-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 12px;
    padding: 20px;
    color: white;
}
.result-card {
    background: #f8f9fa;
    border-radius: 12px;
    padding: 20px;
    border-left: 4px solid #667eea;
}
.sentiment-badge {
    display: inline-block;
    padding: 8px 16px;
    border-radius: 20px;
    font-weight: bold;
    margin: 5px;
}
.badge-positive {
    background: #d4edda;
    color: #155724;
}
.badge-negative {
    background: #f8d7da;
    color: #721c24;
}
.badge-neutral {
    background: #fff3cd;
    color: #856404;
}
.stats-row {
    display: flex;
    gap: 10px;
    margin: 10px 0;
}
.stat-box {
    flex: 1;
    background: white;
    padding: 15px;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    text-align: center;
}
"""

# NOTE(review): the nesting of the widgets below was reconstructed from the
# section comments; confirm it matches the intended layout.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_width=True) as demo:

    # Header
    gr.Markdown(
        """
        # 🎓 Student Feedback RAG System
        ### Multilingual Sentiment Analysis for Urdu, Roman Urdu, and English

        This intelligent system analyzes student feedback using **Retrieval-Augmented Generation (RAG)** and provides similar examples from the database.
        All submissions are automatically saved with timestamp, sentiment analysis, confidence scores, and automatic language detection.
        """
    )

    # Main Input Section
    with gr.Row():
        with gr.Column(scale=2):
            with gr.Group():
                input_text = gr.Textbox(
                    label="✍️ Enter Student Feedback",
                    placeholder="اپنی رائے یہاں لکھیں | Type your feedback here in Urdu, Roman Urdu, or English...",
                    lines=4,
                    elem_classes=["rtl-text"],
                    show_copy_button=True
                )

                with gr.Row():
                    submit_btn = gr.Button(
                        "🔍 Analyze Feedback",
                        variant="primary",
                        size="lg",
                        scale=2
                    )
                    clear_input_btn = gr.Button(
                        "🧹 Clear",
                        variant="secondary",
                        size="lg",
                        scale=1
                    )

            # Error message box (hidden by default)
            error_box = gr.Markdown(visible=False)

        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 🌐 Supported Languages

                | Language | Script | Status |
                |----------|--------|--------|
                | **Urdu** | اردو | ✅ Full Support |
                | **Roman Urdu** | Latin | ✅ Full Support |
                | **English** | English | ✅ Full Support |
                | **Mixed** | Mixed | ✅ Full Support |

                ### 🎯 Sentiment Classes

                - 🟢 **Positive** - Favorable feedback
                - 🔴 **Negative** - Critical feedback
                - 🟡 **Neutral** - Balanced/Objective feedback
                """
            )

    # Results Section (hidden initially)
    with gr.Row(visible=False) as results_row:
        with gr.Column():
            gr.Markdown("## 📊 Analysis Results")

            with gr.Row():
                with gr.Column(scale=1):
                    sentiment_badge = gr.Markdown(
                        label="Predicted Sentiment",
                        visible=False
                    )
                with gr.Column(scale=1):
                    confidence_badge = gr.Markdown(
                        label="Confidence",
                        visible=False
                    )
                with gr.Column(scale=1):
                    language_badge = gr.Markdown(
                        label="Detected Language",
                        visible=False
                    )

            # Similar sentences section
            similar_sentences_box = gr.Textbox(
                label="📋 Similar Sentences from Database (Copy to Clipboard)",
                lines=6,
                max_lines=10,
                interactive=False,
                show_copy_button=True,
                visible=False,
                container=True
            )

    # Similar Examples Section
    gr.Markdown("---")
    gr.Markdown("## 🔍 Similar Examples from Database")
    examples_output = gr.Markdown()

    # History Section
    gr.Markdown("---")
    gr.Markdown("## 🗂️ Submission History")

    with gr.Row():
        with gr.Column(scale=4):
            # Uses 'max_height' rather than the older 'height' parameter
            output_table = gr.Dataframe(
                label="Recent Submissions",
                wrap=True,
                max_height=300,
                show_row_numbers=True,
                show_copy_button=True,
                show_fullscreen_button=True
            )

        with gr.Column(scale=1):
            with gr.Row():
                clear_btn = gr.Button(
                    "🗑️ Clear History",
                    variant="stop",
                    size="sm"
                )
            with gr.Row():
                download_btn = gr.DownloadButton(
                    label="📥 Download CSV",
                    value=USER_FEEDBACK_FILE,
                    variant="secondary",
                    size="sm"
                )

            clear_output = gr.Markdown()

    # Footer
    gr.Markdown(
        """
        ---
        ### 💡 Tips:
        - Type in **Urdu (اردو)**, **Roman Urdu**, or **English** - the system auto-detects the language
        - The system uses **FAISS** for fast similarity search across thousands of feedback entries
        - **Confidence scores** above 80% indicate high reliability predictions
        - All data is saved with **UTF-8 encoding** to properly handle Urdu script
        """
    )

    # Event handlers
    def process_and_show(text, progress=gr.Progress()):
        """Analyse *text* and map the results onto the UI components."""
        outputs, examples_md, succeeded = _classify_with_examples(text, 5, progress)
        error_msg, sentences, table, sent_badge, conf_badge, lang_badge = outputs

        return {
            error_box: error_msg,
            similar_sentences_box: sentences,
            output_table: table,
            sentiment_badge: sent_badge,
            confidence_badge: conf_badge,
            language_badge: lang_badge,
            examples_output: examples_md,
            # Show the results row only when the analysis succeeded
            results_row: gr.update(visible=succeeded)
        }

    # Shared by the button click and the Enter-key submit below.
    analysis_outputs = [
        error_box, similar_sentences_box, output_table, sentiment_badge,
        confidence_badge, language_badge, examples_output, results_row
    ]

    submit_btn.click(
        fn=process_and_show,
        inputs=[input_text],
        outputs=analysis_outputs
    )

    # Also allow Enter key to submit
    input_text.submit(
        fn=process_and_show,
        inputs=[input_text],
        outputs=analysis_outputs
    )

    def clear_and_update():
        """Clear the stored history and refresh the table and download button."""
        status, history, file_path = clear_history()
        return status, history, file_path

    clear_btn.click(
        fn=clear_and_update,
        inputs=[],
        outputs=[clear_output, output_table, download_btn]
    )

    def clear_input():
        """Reset the input box and hide any previous results or errors."""
        return "", gr.update(visible=False), gr.update(visible=False), gr.update(value="")

    clear_input_btn.click(
        fn=clear_input,
        inputs=[],
        outputs=[input_text, results_row, error_box, examples_output]
    )

    # Load existing data on startup
    demo.load(
        fn=load_initial_data,
        inputs=[],
        outputs=[output_table]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )