import os

# ======================================================
# OPTION A: Use ephemeral /tmp cache to avoid 50 GB quota
# ======================================================
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
# ======================================================

import torch
import gradio as gr
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

# ------------------------------------------------------
# 1️⃣ Model setup
# ------------------------------------------------------
GEN_MODEL = "hackergeek/qwen3-harrison-rag"      # main generation model
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # embedding model (lightweight)

tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    cache_dir="/tmp/hf_cache",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True
)

embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")

# ------------------------------------------------------
# 2️⃣ Load and index documents
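# ------------------------------------------------------
# Minimal indexing sketch: retrieve_context() below expects `chunks` and
# `index`, so they are built here. DOC_PATH is a placeholder assumption —
# point it at your own plain-text corpus. Chunking is a simple fixed-size
# character split; swap in a smarter splitter if needed.
DOC_PATH = "docs.txt"

def load_chunks(path, chunk_size=500):
    # Split the corpus into roughly chunk_size-character pieces.
    if not os.path.exists(path):
        return ["No documents indexed yet. Add a corpus file at DOC_PATH to enable retrieval."]
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

chunks = load_chunks(DOC_PATH)

# Embed every chunk once and store the vectors in a flat L2 FAISS index.
chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True).astype("float32")
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(chunk_embeddings)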


# ------------------------------------------------------
# 3️⃣ Retrieval + generation logic
# ------------------------------------------------------
def retrieve_context(query, k=5):
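    """Embed the query and return the k nearest chunks joined into one context string."""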
    q_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    return "\n\n".join([chunks[i] for i in I[0]])

def generate_response(query, history):
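    """Build a retrieval-augmented prompt and return the model's answer; `history` is currently unused."""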
    context = retrieve_context(query)
    system_prompt = (
        "You are a helpful assistant that uses the retrieved context to answer questions.\n\n"
        f"Context:\n{context}\n\n"
        f"User: {query}\nAssistant:"
    )
    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=300)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output.split("Assistant:")[-1].strip()

def chat_fn(user_message, history):
    """Handle one chat turn: generate a reply, clear the textbox, and append the exchange to the history."""
    response = generate_response(user_message, history)
    history = history + [(user_message, response)]
    return "", history

# ------------------------------------------------------
# 4️⃣ Gradio UI
# ------------------------------------------------------
with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Qwen3-Harrison-RAG Chatbot  
        Ask me anything — I’ll retrieve relevant context and answer!
        """
    )
    chatbot = gr.Chatbot(height=400)
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", scale=4)
        clear = gr.Button("Clear", scale=1)

    msg.submit(chat_fn, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

# ------------------------------------------------------
# 5️⃣ Launch for Hugging Face Spaces
# ------------------------------------------------------
if __name__ == "__main__":
    demo.launch()