"""Generate a chat completion locally with the Qwen2-7B-Instruct GPTQ-Int8 model."""
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"

# device_map="auto" lets accelerate place the quantized weights on available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct-GPTQ-Int8")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

# Render the conversation through the model's chat template; the generation
# prompt suffix makes the model continue in the assistant role.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)

# generate() returns prompt + continuation; keep only the newly generated
# tokens by slicing off each prompt's length.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
# Query the local OpenAI-compatible server for a chat completion.
# Fixes: missing spaces after `curl` and `-H`, and a trailing comma after the
# "messages" array — that comma is invalid JSON and strict parsers reject it.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Tell me something about large language models."}
    ]
  }'
"""Query a locally served Qwen2 model through the OpenAI-compatible API."""
from openai import OpenAI

# The local server performs no authentication, so a placeholder key suffices.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:8000/v1",
)

chat_response = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me something about large language models."},
    ],
)
print("Chat response:", chat_response)
[
  {"role": "system", "content": "You are a helpful assistant."},
  {"role": "user", "content": "Tell me who you are."},
  {"role": "assistant", "content": "I am a large language model named Qwen..."}
]