import os
# =====================================================
# OPTION A: Use ephemeral /tmp cache to avoid 50 GB quota
# ======================================================
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
# ======================================================
import torch
import gradio as gr
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ------------------------------------------------------
# 1️⃣ Model setup
# ------------------------------------------------------
GEN_MODEL = "hackergeek/qwen3-harrison-rag" # main generation model
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # embedding model (lightweight)
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    cache_dir="/tmp/hf_cache",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)
embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
# ------------------------------------------------------
# 2️⃣ Load and index documents
# ------------------------------------------------------
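# The retrieval code below references `chunks` and a FAISS `index`, but the
# original file does not include the corpus-loading step. This is a minimal
# sketch: it assumes a plain-text file named "docs.txt" (hypothetical) in the
# Space repo, splits it into fixed-size chunks, and indexes them with FAISS.
DOC_PATH = "docs.txt"   # assumed corpus file; replace with your own documents
CHUNK_SIZE = 500        # characters per chunk (simple fixed-size chunking)

if os.path.exists(DOC_PATH):
    with open(DOC_PATH, "r", encoding="utf-8") as f:
        raw_text = f.read()
    chunks = [raw_text[i:i + CHUNK_SIZE] for i in range(0, len(raw_text), CHUNK_SIZE)]
else:
    # Fallback so the Space still starts when no corpus is provided
    chunks = ["No documents were provided to index."]

# Encode every chunk once and build a flat L2 index in memory
chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True)
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(chunk_embeddings)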
# ------------------------------------------------------
# 3️⃣ Retrieval + generation logic
# ------------------------------------------------------
def retrieve_context(query, k=5):
    # Embed the query, then pull the k nearest chunks from the FAISS index
    q_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    return "\n\n".join(chunks[i] for i in I[0])
def generate_response(query, history):
    # Retrieve supporting context and fold it into a single prompt string
    context = retrieve_context(query)
    prompt = (
        "You are a helpful assistant that uses the retrieved context to answer questions.\n\n"
        f"Context:\n{context}\n\n"
        f"User: {query}\nAssistant:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=300)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Keep only the text generated after the final "Assistant:" marker
    return output.split("Assistant:")[-1].strip()
def chat_fn(user_message, history):
    response = generate_response(user_message, history)
    history = history + [(user_message, response)]
    # Return an empty string to clear the textbox, plus the updated conversation
    return "", history
# ------------------------------------------------------
# 4️⃣ Gradio UI
# ------------------------------------------------------
with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Qwen3-Harrison-RAG Chatbot
        Ask me anything — I’ll retrieve relevant context and answer!
        """
    )
    chatbot = gr.Chatbot(height=400)
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", scale=4)
        clear = gr.Button("Clear", scale=1)
    msg.submit(chat_fn, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)
# ------------------------------------------------------
# 5️⃣ Launch for Hugging Face Spaces
# ------------------------------------------------------
if __name__ == "__main__":
    demo.launch()
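# ------------------------------------------------------
# Deployment note (assumption, not part of the original file): the Space's
# requirements.txt should list the packages imported above, e.g.
#   torch, transformers, accelerate, sentence-transformers, faiss-cpu, gradio, numpy
# `accelerate` is needed because the model is loaded with device_map="auto".
# ------------------------------------------------------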