Spaces · Upload app.py (Space status: Runtime error)

app.py CHANGED
```diff
@@ -4,25 +4,31 @@ import subprocess
 from llama_cpp import Llama
 from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
-from llama_cpp_agent.chat_history import BasicChatHistory
-from llama_cpp_agent.chat_history.messages import Roles
 import gradio as gr
 from huggingface_hub import hf_hub_download
+import logging
+import time
 
-hf_hub_download(
-    filename="gemma-2-9b-it-Q5_K_M.gguf",
-    local_dir="./models"
-)
-hf_hub_download(
-    local_dir="./models"
-)
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
+filename = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
+
+try:
+    start_time = time.time()
+    logger.info("Downloading Model....")
+    hf_hub_download(
+        repo_id = repo_id ,
+        filename = filename,
+        local_dir="./model"
+    )
+    end_time = time.time()
+    logger.info(f"Download complete. Time taken : {start_time - end_time} seconds.")
+
+except Exception as e:
+    logger.error(f"Unable to download Model : {e}")
+    raise
 
 llm = None
 llm_model = None
```
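Review note on the hunk above: the new timing log computes `start_time - end_time`, which yields a negative duration. A minimal corrected sketch of the same download-and-log step (identical calls to the commit; only the subtraction order and message formatting change):

```python
import logging
import time

from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
filename = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"

start_time = time.time()
logger.info("Downloading model...")
hf_hub_download(repo_id=repo_id, filename=filename, local_dir="./model")
elapsed = time.time() - start_time  # end minus start: a positive duration
logger.info(f"Download complete. Time taken: {elapsed:.1f} seconds.")
```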
```diff
@@ -30,16 +36,12 @@ llm_model = None
 @spaces.GPU(duration=120)
 def respond(
     message,
-    history: list[tuple[str, str]],
     model,
     system_message,
     max_tokens,
     temperature,
-    top_p,
-    top_k,
-    repeat_penalty,
 ):
-    chat_template = MessagesFormatterType.
+    chat_template = MessagesFormatterType.LLAMA_3
 
     global llm
     global llm_model
```
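Review note: the old formatter name after `MessagesFormatterType.` is truncated in this view. More importantly, `gr.ChatInterface` invokes its `fn` as `fn(message, history, *additional_inputs)`, so deleting the `history` parameter shifts every argument that follows: the Gradio history list lands in `model`. A hedged sketch of a signature that lines up with the two sliders wired in at the bottom of this diff; dropping `model` and `system_message` from the signature is an assumption for illustration, not what the commit does:

```python
def respond(message, history, temperature=0.95, max_tokens=700):
    # Parameter order mirrors gr.ChatInterface's call convention:
    # fn(message, history, *additional_inputs), with the sliders
    # listed as [Temperature, Max new tokens].
    ...
```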
```diff
@@ -48,7 +50,7 @@ def respond(
     llm = Llama(
         model_path=f"models/{model}",
         flash_attn=True,
-        n_gpu_layers
+        n_gpu_layers=-1,
         n_batch=1024,
         n_ctx=8192,
     )
```
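Review note: `n_gpu_layers=-1` asks llama-cpp-python to offload every layer to the GPU (the value it replaces is truncated in this view). Separately, the first hunk downloads the file to `./model` while `Llama` loads from `models/{model}`. A minimal sketch that keeps the two paths consistent; `MODELS_DIR` is an illustrative name not present in the commit, and `repo_id`/`filename` are the variables defined in the first hunk:

```python
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODELS_DIR = "./models"  # illustrative; the commit mixes "./model" and "models/{model}"

hf_hub_download(repo_id=repo_id, filename=filename, local_dir=MODELS_DIR)
llm = Llama(
    model_path=f"{MODELS_DIR}/{filename}",
    flash_attn=True,
    n_gpu_layers=-1,  # -1 = offload all layers to the GPU
    n_batch=1024,
    n_ctx=8192,
)
```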
```diff
@@ -65,30 +67,12 @@ def respond(
 
     settings = provider.get_provider_default_settings()
     settings.temperature = temperature
-    settings.top_k = top_k
-    settings.top_p = top_p
     settings.max_tokens = max_tokens
-    settings.repeat_penalty = repeat_penalty
     settings.stream = True
-
-    messages = BasicChatHistory()
-
-    for msn in history:
-        user = {
-            'role': Roles.user,
-            'content': msn[0]
-        }
-        assistant = {
-            'role': Roles.assistant,
-            'content': msn[1]
-        }
-        messages.add_message(user)
-        messages.add_message(assistant)
 
     stream = agent.get_chat_response(
         message,
         llm_sampling_settings=settings,
-        chat_history=messages,
         returns_streaming_generator=True,
         print_output=False
     )
```
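Review note: with `BasicChatHistory` removed, each `respond` call is stateless and the model never sees earlier turns. The deleted lines above document the API this commit drops; if multi-turn memory is wanted back, a sketch along the old code's lines (assuming `history` is the `list[tuple[str, str]]` the old signature declared):

```python
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles

messages = BasicChatHistory()
for user_turn, assistant_turn in history:
    messages.add_message({'role': Roles.user, 'content': user_turn})
    messages.add_message({'role': Roles.assistant, 'content': assistant_turn})

# Then pass chat_history=messages to agent.get_chat_response(...).
```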
```diff
@@ -98,64 +82,52 @@ def respond(
         outputs += output
         yield outputs
 
+DESCRIPTION = '''
+<div>
+<h1 style="text-align: center;">ContenteaseAI custom trained model</h1>
+</div>
+'''
+
+LICENSE = """
+<p/>
+---
+For more information, visit our [website](https://contentease.ai).
+"""
+
+PLACEHOLDER = """
+<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">ContenteaseAI Custom AI trained model</h1>
+   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Enter the text extracted from the PDF:</p>
+</div>
+"""
+
+css = """
+h1 {
+  text-align: center;
+  display: block;
+}
 """
+# Gradio block
+chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
 
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p",
-        ),
-        gr.Slider(
-            minimum=0,
-            maximum=100,
-            value=40,
-            step=1,
-            label="Top-k",
-        ),
-        gr.Slider(
-            minimum=0.0,
-            maximum=2.0,
-            value=1.1,
-            step=0.1,
-            label="Repetition penalty",
-        ),
-    ],
-    retry_btn="Retry",
-    undo_btn="Undo",
-    clear_btn="Clear",
-    submit_btn="Send",
-    title="Chat with Gemma 2 using llama.cpp",
-    description=description,
-    chatbot=gr.Chatbot(
-        scale=1,
-        likeable=False,
-        show_copy_button=True
+with gr.Blocks(fill_height=True, css=css) as demo:
+    gr.Markdown(DESCRIPTION)
+
+    gr.ChatInterface(
+        fn=respond,
+        chatbot=chatbot,
+        fill_height=True,
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs=[
+            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
+            gr.Slider(minimum=128, maximum=2000, step=1, value=700, label="Max new tokens", render=False),
+        ]
     )
+
+    gr.Markdown(LICENSE)
 
 if __name__ == "__main__":
+    try:
+        demo.launch(show_error=True, debug = True)
+    except Exception as e:
+        logger.error(f"Error launching Gradio demo: {e}")
```
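Review note: as wired, `gr.ChatInterface` will call `respond(message, history, 0.95, 700)`. Against the committed signature, that binds the history list to `model` and the temperature value to `system_message`, and leaves `temperature` unfilled, so every chat turn should raise a `TypeError`; that would be consistent with the Space's Runtime error status shown above. Restoring the `history` parameter (see the sketch after the second hunk) and supplying `model` and `system_message` some other way would line the signature up with the two sliders. Note also that the `try/except` around `demo.launch` only catches failures at launch; exceptions raised inside `respond` surface per request and are shown to the user via `show_error=True`.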