bebechien committed on
Commit
aa8d7de
·
verified ·
1 Parent(s): 923b4e9

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/background.jpg filter=lfs diff=lfs merge=lfs -text
37
+ transformers-4.57.0.dev0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,12 @@
1
  ---
2
  title: Hollow Knight Helper
3
- emoji: 🐠
4
  colorFrom: red
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.44.1
8
  app_file: app.py
9
  pinned: false
10
- license: cc-by-sa-3.0
11
- short_description: Hollow Knight Helper
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Hollow Knight Helper
3
+ emoji: 🕸️
4
  colorFrom: red
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.44.1
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ An example chatbot using [Gradio](https://gradio.app) and Gemma.
app.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import os
4
+ import pickle
5
+ import spaces
6
+ from bs4 import BeautifulSoup
7
+ from html_to_markdown import convert_to_markdown
8
+ from huggingface_hub import login
9
+ from sentence_transformers import SentenceTransformer
10
+ from transformers import pipeline, TextIteratorStreamer
11
+ from threading import Thread
12
+ from tqdm import tqdm
13
+
14
# --- 1. CONFIGURATION ---
# Every tunable setting and constant used by the app is declared here.

# Hugging Face access token (read from the environment) and model identifiers.
HF_TOKEN = os.environ.get('HF_TOKEN')
EMBEDDING_MODEL_ID = "google/embeddinggemma-300M"
LLM_MODEL_ID = "google/gemma-3-12B-it"

# Root URL that scraped page paths are appended to.
BASE_URL = "https://hollowknight.wiki"

# Hollow Knight bosses: category page path and on-disk cache file.
ENTRY_POINT_HOLLOW_KNIGHT = "/w/Category:Bosses_(Hollow_Knight)"
CACHE_FILE_HOLLOW_KNIGHT = "hollow_knight_boss.pkl"

# Silksong bosses: category page path and on-disk cache file.
ENTRY_POINT_SILKSONG = "/w/Category:Bosses_(Silksong)"
CACHE_FILE_SILKSONG = "silksong_boss.pkl"

# Chat behaviour: minimum similarity score for a document to be used as
# context, and the reply sent when no context is available.
DEFAULT_SIMILARITY_THRESHOLD = 0.5
DEFAULT_MESSAGE_NO_MATCH = "I'm sorry, I can't find a relevant document to answer that question. Try asking about a specific boss in Hollow Knight."
36
+
37
+
38
+ # --- 2. HELPER FUNCTIONS ---
39
+ # Reusable functions for web scraping and data processing.
40
+
41
def get_html(url: str, timeout: float = 10.0) -> str:
    """Fetch the HTML body of ``url``.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled page
            would block the whole scraping run.

    Returns:
        The response text, or an empty string when the request fails
        (connection error, timeout, or a 4xx/5xx status). The failure is
        printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for 4xx/5xx responses.
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""
    return response.text
50
+
51
def find_wiki_links(html_content: str) -> list[str]:
    """Collect the ``href`` of every anchor inside the page's 'mw-pages' div.

    Returns an empty list when the page has no such div.
    """
    container = BeautifulSoup(html_content, 'html.parser').find('div', id='mw-pages')
    if container:
        return [anchor['href'] for anchor in container.find_all('a', href=True)]
    return []
58
+
59
def get_markdown_from_url(url: str) -> str:
    """Download ``url`` and return its content converted to Markdown.

    Returns an empty string when the page could not be fetched.
    """
    page_html = get_html(url)
    if page_html:
        # The parsed tree, not the raw HTML string, is handed to the converter.
        return convert_to_markdown(BeautifulSoup(page_html, 'html.parser'))
    return ""
67
+
68
+
69
+ # --- 3. DATA PROCESSING & CACHING ---
70
+ # Scrapes data and generates embeddings, using a cache to avoid re-running.
71
+
72
def load_or_process_source(entry_point: str, cache_file: str, label: str, embedding_model):
    """Return the document store for one data source, building it if needed.

    When ``cache_file`` exists its pickled contents are returned as-is.
    Otherwise the category page at ``BASE_URL + entry_point`` is scraped,
    every linked page is converted to Markdown and embedded, and the result
    is pickled to ``cache_file``.

    Args:
        entry_point: Path of the category page, appended to ``BASE_URL``.
        cache_file: Path of the pickle file used as the cache.
        label: Human-readable source name used in progress messages.
        embedding_model: Object whose ``encode(text, prompt=...)`` produces
            the embedding stored for each document.

    Returns:
        A dict with parallel lists under "titles", "texts" and "embeddings".
        The lists are empty when nothing could be scraped; in that case no
        cache file is written, so the next run retries instead of reusing
        an empty store.
    """
    if os.path.exists(cache_file):
        print(f"✅ Found cache for {label}. Loading data from '{cache_file}'...")
        # NOTE: unpickling can execute arbitrary code; the cache file must
        # come from a trusted source (it is expected to be written below).
        with open(cache_file, 'rb') as f:
            return pickle.load(f)

    print(f"ℹ️ No cache for {label}. Starting data scraping and processing...")
    main_page_html = get_html(BASE_URL + entry_point)
    extracted_links = find_wiki_links(main_page_html)

    contents = {"titles": [], "texts": [], "embeddings": []}

    for doc_path in tqdm(extracted_links, desc=f"Processing {label} Pages"):
        title = doc_path.split('/')[-1]
        original_text = get_markdown_from_url(BASE_URL + doc_path)

        # Trim text from the "References" section onwards for cleaner context.
        text = original_text.split("References\n----------\n", 1)[0].strip()
        if not text:
            continue

        contents["titles"].append(title)
        contents["texts"].append(text)
        # The title is passed as part of the encoding prompt for the document.
        contents["embeddings"].append(
            embedding_model.encode(text, prompt=f"title: {title} | text: ")
        )

    if not contents["texts"]:
        # Persisting an empty store would make a transient scraping failure
        # permanent, because the cache short-circuits every later run.
        print(f"⚠️ No documents collected for {label}; cache not written.")
        return contents

    print(f"✅ {label} processing complete. Saving data to '{cache_file}'...")
    with open(cache_file, 'wb') as f:
        pickle.dump(contents, f)

    return contents
107
+
108
+
109
+ # --- 4. CORE AI LOGIC ---
110
+ # Functions for finding context and generating a response.
111
+
112
def find_best_context(model, query: str, contents: dict, similarity_threshold: float):
    """Return the text of the stored document most similar to ``query``.

    The query is encoded with ``model`` and scored against every entry of
    ``contents["embeddings"]``. The best-scoring document's text is returned
    only when its score is at least ``similarity_threshold``; otherwise, or
    when the query or the embedding list is empty, ``None`` is returned.
    The best score is printed on every lookup.
    """
    if not (query and contents["embeddings"]):
        return None

    scores = model.similarity(
        model.encode(query, prompt_name="query"),
        contents["embeddings"],
    )
    top_index = scores.argmax().item()
    top_score = scores[0, top_index].item()

    print(top_score)
    return contents["texts"][top_index] if top_score >= similarity_threshold else None
127
+
128
# Text of the most recently matched document. It is reused when a later
# message has no sufficiently similar document of its own.
# NOTE(review): this is module-level state, so it appears to be shared by
# every chat session served by this process - confirm that is intended.
context = None


@spaces.GPU
def respond(message: str, history: list, similarity_threshold: float):
    """Stream the LLM's answer to ``message``, grounded in the best context.

    Looks up the document most similar to ``message``; when none passes
    ``similarity_threshold`` the previously stored context is reused. If no
    context is available at all, the default "no match" message is yielded
    and nothing is generated. Otherwise the function yields the response
    accumulated so far each time the model emits new text.
    """
    global context
    match = find_best_context(embedding_model, message, combined_contents, similarity_threshold)
    context = match or context
    if not context:
        # No document matched now and none was stored earlier.
        yield DEFAULT_MESSAGE_NO_MATCH
        return

    system_prompt = f"Answer the following QUESTION based only on the CONTEXT provided. If the answer cannot be found in the CONTEXT, write \"I don't know.\"\n---\nCONTEXT:\n{context}\n"
    user_prompt = f"QUESTION:\n{message}"

    messages = [
        {"role": "system", "content": system_prompt},
        *history,
        {"role": "user", "content": user_prompt},
    ]

    for entry in messages:
        print(entry['role'])
        print(entry['content'])

    streamer = TextIteratorStreamer(llm_pipeline.tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation runs on a worker thread so this generator can consume the
    # streamer while text is still being produced.
    generation_kwargs = {
        "text_inputs": messages,
        "streamer": streamer,
        "max_new_tokens": 512,
        "do_sample": True,
        "top_p": 0.95,
    }
    worker = Thread(target=llm_pipeline, kwargs=generation_kwargs)
    worker.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial
171
+
172
+
173
# --- 5. INITIALIZATION ---
# Authenticate, load both models, and build the combined document store.

if HF_TOKEN:
    print("Logging into Hugging Face Hub...")
    login(token=HF_TOKEN)
else:
    # Per the huggingface_hub documentation, login() without a token falls
    # back to an interactive prompt, which cannot be answered in a headless
    # deployment - so the login is skipped instead.
    print("HF_TOKEN is not set; skipping Hugging Face Hub login.")

print("Initializing embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_ID)

print("Initializing language model...")
llm_pipeline = pipeline(
    "text-generation",
    model=LLM_MODEL_ID,
    device_map="auto",
    dtype="auto",
)

print("\n--- Processing Game Data ---")
hk_contents = load_or_process_source(
    ENTRY_POINT_HOLLOW_KNIGHT, CACHE_FILE_HOLLOW_KNIGHT, "Hollow Knight", embedding_model
)
silksong_contents = load_or_process_source(
    ENTRY_POINT_SILKSONG, CACHE_FILE_SILKSONG, "Silksong", embedding_model
)

print("\nCombining data sources...")
# Concatenate the parallel lists of both sources, Hollow Knight first, so
# that index i refers to the same document in every list.
combined_contents = {
    key: hk_contents[key] + silksong_contents[key]
    for key in ("titles", "texts", "embeddings")
}
print(f"✅ Total documents processed: {len(combined_contents['texts'])}")
205
+
206
+
207
# --- 6. GRADIO UI ---
# Defines the web interface for the chatbot.

# Register the assets directory as a static path (it holds the background
# image referenced by the CSS below).
gr.set_static_paths(paths=["assets/"])

# Theme and CSS for the Silksong aesthetic
silksong_theme = gr.themes.Default(
    primary_hue=gr.themes.colors.red,
    secondary_hue=gr.themes.colors.zinc,
    neutral_hue=gr.themes.colors.zinc,
    font=[gr.themes.GoogleFont("IM Fell English"), "ui-sans-serif", "system-ui", "sans-serif"],
)

# A light or dark gradient is layered over the background image depending on
# whether the page body carries the "dark" class.
silksong_css = """
.gradio-container {
    background-image: linear-gradient(rgba(255,255,255, 0.5), rgba(255, 255, 255, 1.0)), url("/gradio_api/file=assets/background.jpg");
    background-size: cover;
    background-repeat: no-repeat;
    background-position: center;
}
body.dark .gradio-container {
    background-image: linear-gradient(rgba(0, 0, 0, 0.5), rgba(0, 0, 0, 1.0)), url("/gradio_api/file=assets/background.jpg");
}
.header-text { text-align: center; text-shadow: 2px 2px 5px #000; }
.header-text h1 { font-size: 2.5em; color: #dc2626; }
.dark .header-text { text-shadow: 2px 2px 5px #FFF; }
.disclaimer { text-align: center; color: var(--body-text-color-subdued); font-size: 0.9em; padding: 20px; }
.disclaimer ul { list-style: none; padding: 0; }
.disclaimer a { color: #dc2626; }
"""

with gr.Blocks(theme=silksong_theme, css=silksong_css) as demo:
    # Page header.
    gr.HTML("""
    <div class="header-text">
        <h1>A Weaver's Counsel</h1>
        <p>Speak, little traveler. What secrets of Pharloom do you seek?</p>
        <p style="font-style: italic;">(Note: This bot currently only has knowledge about bosses)</p>
    </div>
    """)

    # Chat panel. The slider value is passed to respond() as its
    # similarity_threshold argument; each example row supplies a message
    # plus a value for that slider.
    gr.ChatInterface(
        respond,
        type="messages",
        chatbot=gr.Chatbot(type="messages", label=LLM_MODEL_ID),
        textbox=gr.Textbox(placeholder="Ask about the haunted kingdom...", container=False, submit_btn=True, scale=7),
        additional_inputs=[
            gr.Slider(minimum=0.1, maximum=1.0, value=DEFAULT_SIMILARITY_THRESHOLD, step=0.1, label="Similarity Threshold"),
        ],
        examples=[
            ["Where can I find the Moorwing?", DEFAULT_SIMILARITY_THRESHOLD],
            ["Who is the voice of Lace?", DEFAULT_SIMILARITY_THRESHOLD],
            ["How can I beat the False Knight?", DEFAULT_SIMILARITY_THRESHOLD],
            ["What achievement for Hornet Protector?", DEFAULT_SIMILARITY_THRESHOLD],
        ],
    )

    # Footer: attribution and licence notice for the scraped wiki content.
    # The licence is named in full ("Creative Commons ...") to match the
    # linked CC BY-SA 3.0 deed.
    gr.HTML("""
    <div class="disclaimer">
        <p><strong>Disclaimer:</strong></p>
        <ul style="list-style: none; padding: 0;">
            <li>This is a fan-made personal demonstration and not affiliated with any organization.<br>The bot is for entertainment purposes only.</li>
            <li>Factual information is sourced from the <a href="https://hollowknight.wiki" target="_blank">Hollow Knight Wiki</a>.<br>Content is available under <a href="https://creativecommons.org/licenses/by-sa/3.0/" target="_blank">Creative Commons Attribution-ShareAlike</a> unless otherwise noted.</li>
            <li>Built by <a href="https://huggingface.co/bebechien" target="_blank">bebechien</a> with a 💖 for the world of Hollow Knight.</li>
        </ul>
    </div>
    """)

if __name__ == "__main__":
    demo.launch()
assets/background.jpg ADDED

Git LFS Details

  • SHA256: e2bcd4a86e7e95206d0dd4c99be99e4b00e565e472d9c454b3338f1a1589ae54
  • Pointer size: 131 Bytes
  • Size of remote file: 352 kB
hollow_knight_boss.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:608417424fb5f9670689cb318868bc19ddba6a524fa6df8f3d43c47393e65a13
3
+ size 976095
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ accelerate
2
+ beautifulsoup4
3
+ html_to_markdown
4
+ sentence-transformers
5
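+ git+https://github.com/huggingface/transformers@<git-ref>  # the ref after '@' was mangled by e-mail obfuscation in this capture; restore the original tag or commit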
silksong_boss.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4e7d57a2234046677a429cae5143d55812de509660a12a5225afcf576e539b7
3
+ size 66657