"""Offline batch inference with vLLM for Qwen2-7B-Instruct."""
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Tokenizer is only used here to render the chat template into a prompt string.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

# `max_tokens` sets the maximum length of the generated output.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=512,
)

# GPTQ and AWQ quantized models are also supported here.
llm = LLM(model="Qwen/Qwen2-7B-Instruct")

prompt = "Tell me something about large language models."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

# Render the conversation into a single prompt string using the model's
# chat template; `add_generation_prompt=True` appends the assistant-turn
# marker so the model knows to start generating a reply.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

outputs = llm.generate([text], sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# Query the OpenAI-compatible /v1/chat/completions endpoint served by vLLM.
# NOTE: the original payload had a trailing comma after the "messages" array,
# which is invalid JSON and is rejected by strict parsers — removed here.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2-7B-Instruct",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Tell me something about large language models."}
    ]
  }'
"""Query a local vLLM OpenAI-compatible server via the `openai` SDK."""
from openai import OpenAI

# vLLM's OpenAI-compatible server does not validate the API key by default,
# but the client requires a non-empty value.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

chat_response = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me something about large language models."},
    ],
)
print("Chat response:", chat_response)