VLLM量化推理
安装此工具前需安装两个包:
# Build prerequisites. On Debian/Ubuntu the apt package is named
# "pkg-config"; "pkgconfig" is not an apt package name.
sudo apt-get install cmake
sudo apt-get install pkg-config
配置huggingface镜像地址:
# Use hf-mirror.com as the Hugging Face endpoint for this shell and its children.
HF_ENDPOINT=https://hf-mirror.com
export HF_ENDPOINT
下载代码库, 并安装python依赖
# Chain the steps so that a failed clone or cd stops the sequence instead of
# running pip against whatever requirements.txt is in the current directory.
git clone https://github.com/ModelTC/llmc.git \
  && cd llmc/ \
  && pip install -r requirements.txt
找到量化方法的配置文件(本例为 configs/quantization/backend/vllm/rtn_w8a16.yml), 并作修改
# Global settings; the seed value is also exposed as the YAML anchor `seed`.
base:
    seed: &seed 42
# Model to quantize: architecture type, local checkpoint path and dtype.
model:
    type: Llama
    path: /home/paul/.cache/huggingface/models/models--unsloth--llama-3-8b-Instruct-lawdata
    torch_dtype: auto
# RTN quantization of the weights: 8-bit, symmetric, per-group (group size 128).
quant:
    method: RTN
    weight:
        bit: 8
        symmetric: True
        granularity: per_group
        group_size: 128
        # NOTE(review): presumably packs the quantized weights for export -- confirm.
        need_pack: True
# Evaluation settings: wikitext2, with the dataset kept under `path`.
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: True
    path: /home/paul/paulwong/work/workspaces/llmc/dataset
    bs: 1
    seq_len: 2048
    inference_per_block: False
# Output: where the quantized model is written
# (save_vllm presumably selects a vLLM-loadable format -- confirm).
save:
    save_vllm: True
    save_path: /home/paul/.cache/huggingface/models/models--unsloth--llama-3-8b-Instruct-lawdata-quantization
找到run_llmc.sh, 并作修改
#!/bin/bash
# Settings for launching an llmc run with torchrun.
# Edit `llmc`, `task_name` and `config` to match the local checkout and task.

# Uncomment to restrict the run to specific GPUs.
# export CUDA_VISIBLE_DEVICES=0,1

# Root of the llmc checkout; it is prepended to PYTHONPATH
# (presumably so the `llmc` package can be imported).
llmc=/home/paul/paulwong/work/workspaces/llmc
# Append the previous PYTHONPATH only when it is non-empty: a trailing ':'
# would add an empty entry, which Python resolves to the current directory.
export PYTHONPATH="${llmc}${PYTHONPATH:+:${PYTHONPATH}}"

# Alternative task/config pair (AWQ, going by the names); swap with the
# pair below to use it.
# task_name=awq_w_only
# config=${llmc}/configs/quantization/methods/Awq/awq_w_only.yml

# Active task: its name is used for the .log and .pid file names.
task_name=rtn_for_vllm
config="${llmc}/configs/quantization/backend/vllm/rtn_w8a16.yml"

# Values passed to torchrun as --nnodes and --nproc_per_node.
nnodes=1
nproc_per_node=1
#######################################
# Print a random port in 10000-60000 that no socket is listening on.
# Draws a candidate with `shuf` and retries until `ss -tuln` lists no
# listener on ":<port> ".
# Globals:   none (the candidate is kept in a local variable)
# Arguments: none
# Outputs:   the chosen port number on stdout
# Returns:   0 once a port has been printed
# NOTE: the port is only checked, not reserved, so another process can
# still take it before the caller binds to it.
#######################################
find_unused_port() {
  local port
  while true; do
    port=$(shuf -i 10000-60000 -n 1)
    if ! ss -tuln | grep -q ":${port} "; then
      printf '%s\n' "$port"
      return 0
    fi
  done
}
# One random free port serves as both the rendezvous port and the run id.
UNUSED_PORT=$(find_unused_port)
if [[ -z "$UNUSED_PORT" ]]; then
  printf 'error: could not determine a free port\n' >&2
  exit 1
fi
MASTER_ADDR=127.0.0.1
MASTER_PORT=$UNUSED_PORT
task_id=$UNUSED_PORT

# Start the job in the background, detached from the terminal via nohup;
# stdout and stderr go to ${task_name}.log in the current directory.
nohup \
torchrun \
--nnodes "$nnodes" \
--nproc_per_node "$nproc_per_node" \
--rdzv_id "$task_id" \
--rdzv_backend c10d \
--rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
"${llmc}/llmc/__main__.py" --config "$config" --task_id "$task_id" \
> "${task_name}.log" 2>&1 &

# Give the launcher a moment to start its processes before recording them.
sleep 2

# Record the PID of every process whose command line runs __main__.py with
# this exact --task_id. Anchoring on the flag avoids matching unrelated
# processes that merely contain the same digits somewhere in their ps line.
pgrep -f -- "__main__\.py.*--task_id ${task_id}( |\$)" > "${task_name}.pid"
if [[ ! -s "${task_name}.pid" ]]; then
  printf 'warning: no running process found for task_id %s; check %s.log\n' \
    "$task_id" "$task_name" >&2
fi

# To stop the job, signal the recorded PIDs:
#   xargs kill < ${task_name}.pid
# and only if they do not exit:
#   xargs kill -9 < ${task_name}.pid
执行量化操作