from transformers import AutoModelForCausalLM, AutoTokenizer  # (1)!

device = "cuda"
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

# Unlike the previous `model.chat` method, we now use `model.generate`,
# but the input must first be formatted with `tokenizer.apply_chat_template`.
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(  # (2)!
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Use `model.generate` and `tokenizer.batch_decode` directly to get the output.
generated_ids = model.generate(  # (3)!
    model_inputs.input_ids,
    max_new_tokens=512  # (4)!
)
generated_ids = [
    output_ids[len(input_ids):]  # (5)!
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(  # (6)!
    generated_ids,
    skip_special_tokens=True
)[0]
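# A minimal streaming sketch (an optional extension, not one of the steps
# above): reusing the `model`, `tokenizer`, and `model_inputs` defined above,
# `TextStreamer` prints tokens to stdout as they are generated instead of
# waiting for the full completion.
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512,
    streamer=streamer  # decoded text is printed incrementally
)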
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2-7B-Instruct",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Tell me something about large language models."}
    ]
  }'
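# The request above assumes an OpenAI-compatible server is already listening
# on port 8000. As one possible sketch, a vLLM server for this model can be
# started with:
python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2-7B-Instruct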
from openai import OpenAI

# Set the OpenAI API key and base URL (locally, this is the port the vLLM
# server listens on).
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
chat_response = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me something about large language models."},
    ]
)
print("Chat response:", chat_response)
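# A streaming variant as a short sketch (reuses the `client` above; this is
# the standard OpenAI Python client API, nothing Qwen-specific). With
# `stream=True`, the call returns an iterator of chunks, each carrying an
# incremental piece of the reply in `choices[0].delta.content`.
stream = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me something about large language models."},
    ],
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")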