LLM Generation Scripts: Transformers and vLLM
```python
import os

# Restrict the script to a single GPU before any CUDA initialization.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

model_path = "Meta-Llama-3-8B"

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", device_map="auto")

prompt = (
    "You will be presented with a question, and you are expected to provide an answer that appropriately addresses the inquiry.\n"
    "###\n"
    "Instruction: {instruction}\n"
    "Answer:"
)
instruction = "How can I make a cake?"
question = prompt.format(instruction=instruction)
print(question)

# Tokenize the prompt and move it to the model's device.
inputs = tokenizer(question, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=500)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print only the generated continuation, stripping the echoed prompt.
print(answer[len(question):])
```
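Slicing the decoded string with `len(question)` assumes the tokenizer reproduces the prompt text character for character when decoding. A slightly more robust variant, sketched below as a continuation of the script above (not part of the original post), is to decode only the tokens that come after the prompt:

```python
# Sketch: decode only the newly generated tokens instead of slicing the decoded string.
prompt_length = inputs["input_ids"].shape[-1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(generated_text)
```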
```python
import os

# Set the visible GPU before importing vllm so the device selection takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from vllm import LLM, SamplingParams

model_path = "Meta-Llama-3-8B"

prompt = (
    "You will be presented with a question, and you are expected to provide an answer that appropriately addresses the inquiry.\n"
    "###\n"
    "Instruction: {instruction}\n"
    "Answer:"
)

llm = LLM(model=model_path)
# Stop on "###" so generation ends at the prompt's section delimiter.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, stop=["###"])
questions = ["Tell me how to kill someone on the sly and not be suspected."]
prompts = [prompt.format(instruction=q) for q in questions]
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")
```