import os

# ======================================================
# OPTION A: Use ephemeral /tmp cache to avoid 50 GB quota
# ======================================================
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
# ======================================================
import torch
import gradio as gr
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ------------------------------------------------------
# 1️⃣ Model setup
# ------------------------------------------------------
GEN_MODEL = "hackergeek/qwen3-harrison-rag"            # main generation model
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"   # embedding model (lightweight)

tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    cache_dir="/tmp/hf_cache",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)

embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
# ------------------------------------------------------
# 2️⃣ Load and index documents
# ------------------------------------------------------
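# The original listing leaves this section empty, so the lines below are a
# minimal sketch of what the retrieval code further down expects: a list of
# text `chunks` and a FAISS `index` built from their embeddings. The `docs`
# list is a placeholder; swap in your own corpus (files, dataset rows, etc.).
docs = [
    "Placeholder document one. Replace this text with your own content.",
    "Placeholder document two. Each entry becomes one retrievable chunk.",
]

# Naive chunking: one chunk per document entry.
chunks = [d for d in docs if d.strip()]

# Embed the chunks and build a simple flat (exact L2) FAISS index over them.
chunk_embs = embedder.encode(chunks, convert_to_numpy=True)
index = faiss.IndexFlatL2(chunk_embs.shape[1])
index.add(chunk_embs.astype(np.float32))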
# ------------------------------------------------------
# 3️⃣ Retrieval + generation logic
# ------------------------------------------------------
def retrieve_context(query, k=5):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    return "\n\n".join([chunks[i] for i in I[0]])
def generate_response(query, history):
    context = retrieve_context(query)
    system_prompt = (
        "You are a helpful assistant that uses the retrieved context to answer questions.\n\n"
        f"Context:\n{context}\n\n"
        f"User: {query}\nAssistant:"
    )
    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=300)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output.split("Assistant:")[-1].strip()
def chat_fn(user_message, history):
    response = generate_response(user_message, history)
    history = history + [(user_message, response)]
    # Return the updated chat history and an empty string to clear the textbox.
    return history, ""
# ------------------------------------------------------
# 4️⃣ Gradio UI
# ------------------------------------------------------
with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Qwen3-Harrison-RAG Chatbot
        Ask me anything — I’ll retrieve relevant context and answer!
        """
    )
    chatbot = gr.Chatbot(height=400)

    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", scale=4)
        clear = gr.Button("Clear", scale=1)
    msg.submit(chat_fn, [msg, chatbot], [chatbot, msg])
    clear.click(lambda: None, None, chatbot, queue=False)
# ------------------------------------------------------
# 5️⃣ Launch for Hugging Face Spaces
# ------------------------------------------------------
if __name__ == "__main__":
    demo.launch()