File size: 7,216 Bytes
da2da03
 
 
 
 
 
 
 
 
 
 
1eca919
da2da03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75684b1
 
 
 
 
 
 
 
 
 
 
 
 
da2da03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b15c89
da2da03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import os
import gradio as gr
import pickle
import torch
from tqdm import tqdm

from web_helper import get_html, find_wiki_links, get_markdown_from_html, get_markdown_from_url

# --- Hugging Face & Model Configuration ---
# The access token comes from the environment; it is None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get('HF_TOKEN')
EMBEDDING_MODEL_ID = "google/embeddinggemma-300M"
LLM_MODEL_ID = "google/gemma-3-12B-it"

# --- Data Source Configuration ---
# Each game is described by (title, cache folder, category rows); each category
# row is (wiki entry path, cache file name, label). The comprehensions expand
# these rows into dicts keyed "title" / "cache_folder" / "category_list" and
# "entry" / "cache" / "label".
BASE_URL = "https://hollowknight.wiki"
GAME_KNOWLEDGE_DATA = [
    {
        "title": game_title,
        "cache_folder": cache_folder,
        "category_list": [
            {"entry": entry, "cache": cache, "label": label}
            for entry, cache, label in category_rows
        ],
    }
    for game_title, cache_folder, category_rows in (
        (
            "Hollow Knight",
            "1_cache",
            (
                ("/w/Category:Bosses_(Hollow_Knight)", "hollow_knight_bosses.pkl", "Bosses"),
            ),
        ),
        (
            "Silksong",
            "2_cache",
            (
                ("/w/Hornet_(Silksong)", "silksong_hornet.pkl", "General"),
                ("/w/Hollow_Knight:_Silksong", "silksong_game.pkl", "General"),
                ("/w/Category:Areas_(Silksong)", "silksong_areas.pkl", "Areas"),
                ("/w/Category:Bosses_(Silksong)", "silksong_bosses.pkl", "Bosses"),
                ("/w/Category:Items_(Silksong)", "silksong_items.pkl", "Items"),
                ("/w/Category:NPCs_(Silksong)", "silksong_npcs.pkl", "NPCs"),
                ("/w/Tasks", "silksong_tasks.pkl", "Tasks"),
                ("/w/Category:Crests_and_Skills", "silksong_crests_and_skills.pkl", "Crests and Skills"),
                ("/w/Category:Tools", "silksong_tools.pkl", "Tools"),
                ("/w/Category:Abilities_(Silksong)", "silksong_abilities.pkl", "Abilities"),
            ),
        ),
    )
]

def get_all_game_data(embedding_model):
    """Build the knowledge base for every game listed in GAME_KNOWLEDGE_DATA.

    Each category of each configured game is resolved through
    ``_load_or_process_source`` with its entry path, its cache path
    (``<cache_folder>/<cache>``) and its label.

    Args:
        embedding_model: Passed through unchanged to the per-source loader.

    Returns:
        A dict mapping each game title to a single flat list that
        concatenates the loader's results for that game's categories, in
        configuration order.
    """
    print("\n--- Processing Game Data ---")
    knowledge_base = {}

    for game in GAME_KNOWLEDGE_DATA:
        # Start a fresh list per title and keep a direct handle to extend.
        entries = knowledge_base[game['title']] = []
        for category in game['category_list']:
            cache_path = f"{game['cache_folder']}/{category['cache']}"
            loaded = _load_or_process_source(
                category['entry'],
                cache_path,
                category['label'],
                embedding_model,
            )
            entries.extend(loaded)

    return knowledge_base

# --- DATA PROCESSING & CACHING ---
# Scrapes data and generates embeddings, using a cache to avoid re-running.
def _clean_text(text: str) -> str:
    """Removes the references section from the raw text."""
    return text.split("References\n----------\n", 1)[0].strip()

@torch.no_grad()
def _create_data_entry(text: str, doc_path: str, label: str, embedding_model) -> dict | None:
    """Turn one scraped page into a record of text, embedding and metadata.

    Args:
        text: Raw page text; its references section is removed first.
        doc_path: Wiki path of the page. Its last ``/``-separated segment is
            used as the title, and ``BASE_URL + doc_path`` as the source URL.
        label: Category label stored in the metadata.
        embedding_model: Object whose ``encode(text, prompt=...)`` produces
            the embedding.

    Returns:
        A dict with keys ``"text"``, ``"embedding"`` (a ``torch`` tensor) and
        ``"metadata"`` (``"category"``, ``"source"``, ``"title"``), or
        ``None`` when nothing is left after cleaning.
    """
    body = _clean_text(text)
    if not body:
        return None

    # The final path segment doubles as the document title.
    page_title = doc_path.rsplit('/', 1)[-1]
    vector = embedding_model.encode(body, prompt=f"title: {page_title} | text: ")

    # Wrap the encoder output in a tensor so entries can be stacked later.
    entry = {
        "text": body,
        "embedding": torch.tensor(vector),
    }
    entry["metadata"] = {
        "category": label,
        "source": BASE_URL + doc_path,
        "title": page_title,
    }
    return entry

def _load_or_process_source(entry_point: str, cache_file: str, label: str, embedding_model):
    """

    Loads processed data from a cache file if it exists. Otherwise, scrapes,

    processes, generates embeddings, and saves to the cache.

    """
    if os.path.exists(cache_file):
        print(f"✅ Found cache for {label}. Loading data from '{cache_file}'...")
        with open(cache_file, 'rb') as f:
            return pickle.load(f)

    print(f"ℹ️ No cache for {label}. Starting data scraping and processing...")
    processed_data = []

    main_page_html = get_html(BASE_URL + entry_point)
    data_entry = _create_data_entry(get_markdown_from_html(main_page_html), entry_point, label, embedding_model)
    if (data_entry):
        processed_data.append(data_entry)

    extracted_links = find_wiki_links(main_page_html)

    for doc_path in tqdm(extracted_links, desc=f"Processing {label} Pages"):
        full_url = BASE_URL + doc_path
        text = get_markdown_from_url(full_url)

        data_entry = _create_data_entry(text, doc_path, label, embedding_model)
        if data_entry:
            processed_data.append(data_entry)

    print(f"✅ {label} processing complete. Saving {len(processed_data)} entries to '{cache_file}'...")
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with open(cache_file, 'wb') as f:
        pickle.dump(processed_data, f)

    return processed_data

# --- App Logic Configuration ---
# NOTE(review): the code that reads these values is outside this section. The
# names suggest similarity cut-offs for a first question vs. a follow-up
# question, and the reply shown when no document matches — confirm against
# the consumer before tuning.
BASE_SIMILARITY_THRESHOLD = 0.2
FOLLOWUP_SIMILARITY_THRESHOLD = 0.5
DEFAULT_MESSAGE_NO_MATCH = "I'm sorry, I can't find a relevant document to answer that question."


# --- Gradio UI Configuration ---
# Theme derived from Gradio's Default theme: red primary hue, zinc secondary
# and neutral hues, and the "IM Fell English" Google font with generic
# sans-serif fallbacks.
silksong_theme = gr.themes.Default(
    primary_hue=gr.themes.colors.red,
    secondary_hue=gr.themes.colors.zinc,
    neutral_hue=gr.themes.colors.zinc,
    font=[gr.themes.GoogleFont("IM Fell English"), "ui-sans-serif", "system-ui", "sans-serif"],
)

# Custom stylesheet text for the UI. It layers a white (light mode) or black
# (dark mode) gradient over assets/background.jpg on the app container, and
# styles the .header-text, .context and .disclaimer elements (centered text,
# red #dc2626 headings and links).
# NOTE(review): the background URL uses the /gradio_api/file= route, so the
# assets directory presumably has to be exposed by the app launch — confirm.
silksong_css="""

.gradio-container {

    background-image: linear-gradient(rgba(255,255,255, 0.5), rgba(255, 255, 255, 1.0)), url("/gradio_api/file=assets/background.jpg");

    background-size: 100%;

    background-repeat: no-repeat;

    background-position: top center;

}

body.dark .gradio-container {

    background-image: linear-gradient(rgba(0, 0, 0, 0.5), rgba(0, 0, 0, 1.0)), url("/gradio_api/file=assets/background.jpg");

}

.header-text { text-align: center; text-shadow: 2px 2px 5px #000; }

.header-text h1 { font-size: 2.5em; color: #dc2626; }

.dark .header-text { text-shadow: 2px 2px 5px #FFF; }

.context { text-align: center; color: var(--body-text-color-subdued); }

.context a { color: #dc2626; }

.disclaimer { text-align: center; color: var(--body-text-color-subdued); font-size: 0.9em; padding: 20px; }

.disclaimer ul { list-style: none; padding: 0; }

.disclaimer a { color: #dc2626; }

"""