import os

# ======================================================
# OPTION A: Use ephemeral /tmp cache to avoid 50 GB quota
# ======================================================
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
# ======================================================
import torch
import gradio as gr
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ------------------------------------------------------
# 1️⃣ Model setup
# ------------------------------------------------------
GEN_MODEL = "hackergeek/qwen3-harrison-rag"            # main generation model
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"   # embedding model (lightweight)

tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    cache_dir="/tmp/hf_cache",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)

embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
# ------------------------------------------------------
# 2️⃣ Load and index documents
# ------------------------------------------------------
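# The original listing leaves this section empty, so the lines below are a
# minimal sketch of what the retrieval code further down expects: a list of
# text `chunks` and a FAISS `index` built from their embeddings. The `docs`
# list is a placeholder; swap in your own corpus (files, dataset rows, etc.).
docs = [
    "Placeholder document one. Replace this text with your own content.",
    "Placeholder document two. Each entry becomes one retrievable chunk.",
]

# Naive chunking: one chunk per document entry.
chunks = [d for d in docs if d.strip()]

# Embed the chunks and build a simple flat (exact L2) FAISS index over them.
chunk_embs = embedder.encode(chunks, convert_to_numpy=True)
index = faiss.IndexFlatL2(chunk_embs.shape[1])
index.add(chunk_embs.astype(np.float32))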
# ------------------------------------------------------
# 3️⃣ Retrieval + generation logic
# ------------------------------------------------------
def retrieve_context(query, k=5):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    return "\n\n".join([chunks[i] for i in I[0]])
def generate_response(query, history):
    context = retrieve_context(query)
    system_prompt = (
        "You are a helpful assistant that uses the retrieved context to answer questions.\n\n"
        f"Context:\n{context}\n\n"
        f"User: {query}\nAssistant:"
    )
    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=300)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output.split("Assistant:")[-1].strip()
def chat_fn(user_message, history):
    response = generate_response(user_message, history)
    history = history + [(user_message, response)]
    # Return the updated chat history and an empty string to clear the textbox.
    return history, ""
# ------------------------------------------------------
# 4️⃣ Gradio UI
# ------------------------------------------------------
with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Qwen3-Harrison-RAG Chatbot
        Ask me anything — I’ll retrieve relevant context and answer!
        """
    )
    chatbot = gr.Chatbot(height=400)

    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", scale=4)
        clear = gr.Button("Clear", scale=1)
    msg.submit(chat_fn, [msg, chatbot], [chatbot, msg])
    clear.click(lambda: None, None, chatbot, queue=False)
# ------------------------------------------------------
# 5️⃣ Launch for Hugging Face Spaces
# ------------------------------------------------------
if __name__ == "__main__":
    demo.launch()