# llama-3.2-peft / app.py
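"""Gradio demo for a GGUF-quantized PEFT fine-tune of Llama-3.2-1B-Instruct.

The model is loaded through transformers' GGUF support and served behind a
simple question-answering interface.
"""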
import gradio as gr
import transformers

# Load the GGUF-quantized checkpoint. transformers dequantizes GGUF weights
# into regular tensors of the requested dtype at load time, so inference
# below runs on standard fp16 weights rather than llama.cpp kernels.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "baldassarre/llama-3.2-peft-Q4_K_M-GGUF",
    dtype="float16",
    gguf_file="llama-3.2-peft-q4_k_m.gguf",
)

# The tokenizer and chat template are taken from the base instruct model
# rather than from the GGUF repo.
tokenizer = transformers.AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
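
# Note: no device is set, so the model runs on CPU. On a GPU host one could
# add model.to("cuda") here (assumption: enough memory is available); the
# dequantized fp16 weights take roughly 2 bytes per parameter, about 2.5 GB
# for this 1B model.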

def generate(prompt, max_new_tokens):
    # Wrap the user's question in the Llama-3 chat template and tokenize,
    # ending with the assistant header so the model starts its reply there.
    inputs = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": prompt},
        ],
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    )
    # Sample with a high temperature, filtered by min-p. do_sample=True is
    # needed: without it decoding is greedy, and neither temperature scaling
    # nor min-p filtering changes which token has the highest score.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        logits_processor=[
            transformers.TemperatureLogitsWarper(temperature=1.5),
            transformers.MinPLogitsWarper(min_p=0.1),
        ],
    )
    # Decode the full sequence, then keep only the assistant's reply: the
    # text after the assistant header and before the end-of-turn token.
    txt = tokenizer.decode(outputs[0])
    txt = txt.split("<|start_header_id|>assistant<|end_header_id|>\n\n", maxsplit=1)[1]
    txt = txt.split("<|eot_id|>", maxsplit=1)[0]
    return txt
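
# Hypothetical smoke test (not part of the Space UI): call generate() directly
# to check the model before launching the interface, e.g.
#   print(generate("Who invented the radio?", 20))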

# One question in, one answer out, with a slider for the generation budget.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(placeholder="What is ...", label="Question"),
        gr.Slider(10, 500, value=100, label="Max new tokens"),
    ],
    outputs=gr.Textbox(label="Answer", interactive=False),
    title="PEFT Model",
    examples=[
        ["What are the ingredients for pizza?", 200],
        ["Which city is famous for tortellini?", 20],
        ["Who invented the radio?", 20],
        ["Explain how to load a dishwasher.", 400],
    ],
)
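
# On a shared Space one might enable request queueing so concurrent users do
# not contend for the single model instance, e.g. demo.queue().launch()
# (assumption: Gradio's default queue settings are acceptable here).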
demo.launch()