Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,216 Bytes
da2da03 1eca919 da2da03 75684b1 da2da03 7b15c89 da2da03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import os
import gradio as gr
import pickle
import torch
from tqdm import tqdm
from web_helper import get_html, find_wiki_links, get_markdown_from_html, get_markdown_from_url
# --- Hugging Face & Model Configuration ---
# The access token is read from the environment; it is None when the
# HF_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Model identifiers for the embedding model and the LLM.
EMBEDDING_MODEL_ID = "google/embeddinggemma-300M"
LLM_MODEL_ID = "google/gemma-3-12B-it"
# --- Data Source Configuration ---
# Every "entry" path below is appended to this base URL when pages are fetched.
BASE_URL = "https://hollowknight.wiki"


def _category(entry, cache, label):
    """Build one category descriptor: entry path, cache file name, label."""
    return {"entry": entry, "cache": cache, "label": label}


# One record per game. Each record names the game ("title"), the folder its
# cache files live in ("cache_folder") and the sources to process
# ("category_list").
GAME_KNOWLEDGE_DATA = [
    {
        "title": "Hollow Knight",
        "cache_folder": "1_cache",
        "category_list": [
            _category("/w/Category:Bosses_(Hollow_Knight)", "hollow_knight_bosses.pkl", "Bosses"),
        ],
    },
    {
        "title": "Silksong",
        "cache_folder": "2_cache",
        "category_list": [
            _category("/w/Hornet_(Silksong)", "silksong_hornet.pkl", "General"),
            _category("/w/Hollow_Knight:_Silksong", "silksong_game.pkl", "General"),
            _category("/w/Category:Areas_(Silksong)", "silksong_areas.pkl", "Areas"),
            _category("/w/Category:Bosses_(Silksong)", "silksong_bosses.pkl", "Bosses"),
            _category("/w/Category:Items_(Silksong)", "silksong_items.pkl", "Items"),
            _category("/w/Category:NPCs_(Silksong)", "silksong_npcs.pkl", "NPCs"),
            _category("/w/Tasks", "silksong_tasks.pkl", "Tasks"),
            _category("/w/Category:Crests_and_Skills", "silksong_crests_and_skills.pkl", "Crests and Skills"),
            _category("/w/Category:Tools", "silksong_tools.pkl", "Tools"),
            _category("/w/Category:Abilities_(Silksong)", "silksong_abilities.pkl", "Abilities"),
        ],
    },
]
def get_all_game_data(embedding_model):
    """Build the knowledge base for every game listed in GAME_KNOWLEDGE_DATA.

    For each game, every configured category is loaded (or processed) through
    ``_load_or_process_source`` and the results are concatenated in
    configuration order.

    Args:
        embedding_model: Forwarded unchanged to ``_load_or_process_source``.

    Returns:
        A dict mapping each game's title to the combined list of entries
        returned for its categories.
    """
    print("\n--- Processing Game Data ---")
    knowledge_base = {}
    for game in GAME_KNOWLEDGE_DATA:
        documents = []
        knowledge_base[game["title"]] = documents
        for source in game["category_list"]:
            # Cache files live at "<cache_folder>/<cache file name>".
            folder = game["cache_folder"]
            filename = source["cache"]
            documents.extend(
                _load_or_process_source(
                    source["entry"],
                    f"{folder}/{filename}",
                    source["label"],
                    embedding_model,
                )
            )
    return knowledge_base
# --- DATA PROCESSING & CACHING ---
# Scrapes data and generates embeddings, using a cache to avoid re-running.
def _clean_text(text: str) -> str:
"""Removes the references section from the raw text."""
return text.split("References\n----------\n", 1)[0].strip()
@torch.no_grad()
def _create_data_entry(text: str, doc_path: str, label: str, embedding_model) -> dict | None:
    """Turn one page's raw text into a knowledge-base entry.

    Args:
        text: Raw page text; it is passed through ``_clean_text`` first.
        doc_path: Path of the page. Its last "/"-separated segment becomes
            the title, and ``BASE_URL + doc_path`` becomes the source.
        label: Category label stored in the entry's metadata.
        embedding_model: Object whose ``encode(text, prompt=...)`` method
            produces the embedding.

    Returns:
        A dict with "text", "embedding" (a ``torch.Tensor``) and "metadata"
        keys, or ``None`` when nothing is left after cleaning.
    """
    body = _clean_text(text)
    if not body:
        return None

    page_title = doc_path.rsplit("/", 1)[-1]
    # The title is handed to the model through the prompt prefix.
    vector = embedding_model.encode(body, prompt=f"title: {page_title} | text: ")
    # Per the original author's note, encode() yields a numpy array; it is
    # stored as a tensor so entries can be stacked later.
    embedding = torch.tensor(vector)

    metadata = {
        "category": label,
        "source": BASE_URL + doc_path,
        "title": page_title,
    }
    return {"text": body, "embedding": embedding, "metadata": metadata}
def _load_or_process_source(entry_point: str, cache_file: str, label: str, embedding_model):
"""
Loads processed data from a cache file if it exists. Otherwise, scrapes,
processes, generates embeddings, and saves to the cache.
"""
if os.path.exists(cache_file):
print(f"✅ Found cache for {label}. Loading data from '{cache_file}'...")
with open(cache_file, 'rb') as f:
return pickle.load(f)
print(f"ℹ️ No cache for {label}. Starting data scraping and processing...")
processed_data = []
main_page_html = get_html(BASE_URL + entry_point)
data_entry = _create_data_entry(get_markdown_from_html(main_page_html), entry_point, label, embedding_model)
if (data_entry):
processed_data.append(data_entry)
extracted_links = find_wiki_links(main_page_html)
for doc_path in tqdm(extracted_links, desc=f"Processing {label} Pages"):
full_url = BASE_URL + doc_path
text = get_markdown_from_url(full_url)
data_entry = _create_data_entry(text, doc_path, label, embedding_model)
if data_entry:
processed_data.append(data_entry)
print(f"✅ {label} processing complete. Saving {len(processed_data)} entries to '{cache_file}'...")
os.makedirs(os.path.dirname(cache_file), exist_ok=True)
with open(cache_file, 'wb') as f:
pickle.dump(processed_data, f)
return processed_data
# --- App Logic Configuration ---
# Similarity cut-offs; the code that applies them is outside this section.
BASE_SIMILARITY_THRESHOLD: float = 0.2
FOLLOWUP_SIMILARITY_THRESHOLD: float = 0.5

# Fallback reply text for when no relevant document is found.
DEFAULT_MESSAGE_NO_MATCH: str = (
    "I'm sorry, I can't find a relevant document to answer that question."
)
# --- Gradio UI Configuration ---
# Theme derived from Gradio's default theme: red primary hue, zinc secondary
# and neutral hues, and the "IM Fell English" Google font with generic
# sans-serif fallbacks.
silksong_theme = gr.themes.Default(
    primary_hue=gr.themes.colors.red,
    secondary_hue=gr.themes.colors.zinc,
    neutral_hue=gr.themes.colors.zinc,
    font=[gr.themes.GoogleFont("IM Fell English"), "ui-sans-serif", "system-ui", "sans-serif"],
)
# Custom stylesheet text. It layers a gradient over assets/background.jpg
# (light and dark variants) and styles the .header-text, .context and
# .disclaimer classes.
# NOTE(review): where this string and the theme are passed to Gradio is not
# visible in this section — confirm against the UI construction code.
silksong_css="""
.gradio-container {
background-image: linear-gradient(rgba(255,255,255, 0.5), rgba(255, 255, 255, 1.0)), url("/gradio_api/file=assets/background.jpg");
background-size: 100%;
background-repeat: no-repeat;
background-position: top center;
}
body.dark .gradio-container {
background-image: linear-gradient(rgba(0, 0, 0, 0.5), rgba(0, 0, 0, 1.0)), url("/gradio_api/file=assets/background.jpg");
}
.header-text { text-align: center; text-shadow: 2px 2px 5px #000; }
.header-text h1 { font-size: 2.5em; color: #dc2626; }
.dark .header-text { text-shadow: 2px 2px 5px #FFF; }
.context { text-align: center; color: var(--body-text-color-subdued); }
.context a { color: #dc2626; }
.disclaimer { text-align: center; color: var(--body-text-color-subdued); font-size: 0.9em; padding: 20px; }
.disclaimer ul { list-style: none; padding: 0; }
.disclaimer a { color: #dc2626; }
"""
|