Ubuntu下OpenELM安装与运行指南
一 环境准备
conda create -n openelm python=3.10 -y
conda activate openelm
pip install torch==2.1.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
pip install transformers==4.36.2 tokenizers==0.15.2 sentencepiece==0.2.0 accelerate==0.25.0
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
# 在 https://huggingface.co/settings/tokens 创建令牌
二 安装与获取模型
pip install transformers datasets
在 Python 中加载与推理:
# Load apple/OpenELM-3B-Instruct and run a short sampled generation.
# NOTE: OpenELM ships its model code inside the Hugging Face repo, so
# from_pretrained needs trust_remote_code=True on transformers 4.36.2
# (the OpenELM architecture is not built into that release).
# NOTE(review): the official model card recommends the Llama-2 tokenizer
# (meta-llama/Llama-2-7b-hf) — if AutoTokenizer fails on this repo, load
# the tokenizer from there instead; verify against the model card.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "apple/OpenELM-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

prompt = "Once upon a time there was"
# Keep the input tensors on the same device as the model (CPU or GPU).
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Stochastic sampling; max_length caps total tokens (prompt included).
outputs = model.generate(**inputs, max_length=64, temperature=0.7, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
若仓库为私有或较大,需登录 Hugging Face:
huggingface-cli login
# 或在代码中传入 use_auth_token="YOUR_HF_TOKEN"
git clone https://gitcode.com/mirrors/apple/OpenELM-3B-Instruct
cd OpenELM-3B-Instruct
ls -lh | grep "model-.*\.safetensors"  # 应看到两个分片权重文件
然后在 Transformers 中通过本地路径加载:
model = AutoModelForCausalLM.from_pretrained("/abs/path/OpenELM-3B-Instruct")
三 运行与性能优化
model = model.to("cuda")
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = model.to(device)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
pip install bitsandbytes
# Load the model with 8-bit quantization (bitsandbytes) to roughly halve
# GPU memory use; device_map="auto" lets accelerate spread layers across
# the available devices automatically.
# NOTE(review): load_in_8bit= is accepted by the pinned transformers
# 4.36.2 but deprecated in later releases (use BitsAndBytesConfig there).
model = AutoModelForCausalLM.from_pretrained(
model_name,
load_in_8bit=True,
device_map="auto"
)
可通过降低 max_length、使用量化、开启 FlashAttention-2(若可用)等方式进一步优化。
四 Docker容器化部署
docker pull nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
mkdir -p /data/web/disk1/git_repo/mirrors/apple/OpenELM-3B-Instruct
cd /data/web/disk1/git_repo/mirrors/apple/OpenELM-3B-Instruct
docker run -it --gpus all \
-v $(pwd):/workspace \
-p 7860:7860 \
--name openelm-deploy \
nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 /bin/bash
容器内创建虚拟环境并安装依赖(见上文),随后即可运行推理脚本或启动 Gradio/API 服务。
五 常见问题与排查