news 2026/6/24 2:59:41

Vllm 转 Ollama 接口

作者头像

张小明

前端开发工程师

1.2k 24
文章封面图
Vllm 转 Ollama 接口

最新版本VisualStudio已经可以接入其他大语言模型了,通过管理模型的接口进入设置

但是它不支持 vLLM、llama.cpp 等其他接口

而且 Ollama 接口好像也只支持本地的 http://localhost:11434,其他地址不支持

到以上位置,这个点击添加按钮没啥用,改地址也不行

直接用大模型帮忙写一个脚本,直接做接口转换,把VS请求的localhost接口转到已经部署的vllm接口,其他接口类似就行,但是需要注意的是要实现完整的ollama接口

#!/usr/bin/env python3
"""Ollama to vLLM simple API forwarder.

Exposes a subset of the Ollama REST API (plus an OpenAI-style passthrough)
on the local Ollama port and translates every call into a request against
the OpenAI-compatible HTTP API of the vLLM server configured below.
"""
import json
import os
import sys
from datetime import datetime, timezone

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
import httpx

# ===== Configuration =====
VLLM_URL = "http://172.16.1.15:8000"
VLLM_API_KEY = ""  # fill in when the vLLM server requires an API key
OLLAMA_NAME = "qwen3.6-27b"  # model name advertised on the Ollama side
VLLM_MODEL = "/data/ai/models/Qwen3.6-27B"  # model name sent to vLLM
# =========================

app = FastAPI()
client = httpx.AsyncClient(timeout=300.0)

# Sentinel returned by _parse_sse_line() for the terminating "[DONE]" event.
_STREAM_DONE = object()


def _now():
    """Return the local wall-clock time used as the log line prefix."""
    return datetime.now().strftime('%H:%M:%S')


def _created_at():
    """Return the current UTC time as an RFC 3339 string with a Z suffix."""
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


def _auth_headers():
    """Build the headers sent to vLLM, adding the bearer token when set."""
    headers = {"Content-Type": "application/json"}
    if VLLM_API_KEY:
        headers["Authorization"] = f"Bearer {VLLM_API_KEY}"
    return headers


def _safe_json(resp):
    """Decode a response body as JSON, returning None when it is not JSON."""
    try:
        return resp.json()
    except ValueError:
        return None


def log_request(method, url, body=None, headers=None):
    """Print a request log entry.

    The Authorization header value is masked so the API key never reaches
    the console.
    """
    print(f"\n{'='*60}", flush=True)
    print(f"[{_now()}] {method} {url}", flush=True)
    if headers:
        shown = {
            key: ("Bearer ***" if key.lower() == "authorization" else value)
            for key, value in headers.items()
        }
        print(f"Headers: {json.dumps(shown, ensure_ascii=False)}", flush=True)
    if body:
        print(f"Body: {json.dumps(body, ensure_ascii=False, indent=2)}", flush=True)
    print(f"{'='*60}\n", flush=True)


def log_response(status_code, data=None):
    """Print a response log entry."""
    print(f"[{_now()}] Response: {status_code}", flush=True)
    if data:
        print(f"Data: {json.dumps(data, ensure_ascii=False, indent=2)}", flush=True)
    print(f"{'='*60}\n", flush=True)


async def _read_json(request):
    """Parse the request body as a JSON object; raise HTTP 400 otherwise."""
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
    return body


async def _post_vllm(path, payload, headers):
    """POST payload to vLLM and return the decoded JSON body.

    Raises:
        HTTPException: 502 when vLLM is unreachable or answers with a
            non-JSON body; the upstream status code when it is not 200.
    """
    url = f"{VLLM_URL}{path}"
    log_request("POST", url, payload, headers)
    try:
        resp = await client.post(url, json=payload, headers=headers)
    except httpx.HTTPError as exc:
        raise HTTPException(status_code=502, detail=f"vLLM request failed: {exc}") from exc
    data = _safe_json(resp)
    log_response(resp.status_code, data)
    if resp.status_code != 200:
        raise HTTPException(status_code=resp.status_code, detail=resp.text)
    if data is None:
        raise HTTPException(status_code=502, detail="vLLM returned a non-JSON response")
    return data


async def _fetch_vllm_models():
    """Return the entries of vLLM's /v1/models, or None when unavailable."""
    url = f"{VLLM_URL}/v1/models"
    log_request("GET", url)
    try:
        resp = await client.get(url, headers=_auth_headers())
    except httpx.HTTPError as exc:
        print(f"[{_now()}] GET {url} failed: {exc}", flush=True)
        return None
    data = _safe_json(resp)
    log_response(resp.status_code, data)
    if resp.status_code != 200 or not isinstance(data, dict):
        return None
    return data.get("data") or []


def _model_details(model_info, max_len):
    """Build the Ollama "details" object for one vLLM model entry."""
    return {
        "parent_model": model_info.get("parent", "") or "",
        "format": "vllm",
        "family": "qwen3",
        "families": ["qwen3"],
        "parameter_size": "27B",
        "quantization_level": "FP16",
        "context_length": max_len,
        "embedding_length": 5120
    }


def _sampling_params(body):
    """Collect sampling parameters from an Ollama request body.

    Top-level keys are honoured first; the Ollama-style "options" object is
    used as a fallback for temperature, top_p and num_predict.
    """
    options = body.get("options")
    if not isinstance(options, dict):
        options = {}

    def pick(key, default):
        for source in (body, options):
            if source.get(key) is not None:
                return source[key]
        return default

    params = {
        "temperature": pick("temperature", 1.0),
        "top_p": pick("top_p", 1.0),
    }
    max_tokens = body.get("max_tokens")
    if not max_tokens:
        num_predict = options.get("num_predict")
        # Only forward a real cap; non-positive values are not a valid
        # max_tokens for the OpenAI-style API.
        if isinstance(num_predict, int) and num_predict > 0:
            max_tokens = num_predict
    if max_tokens:
        params["max_tokens"] = max_tokens
    return params


def _first_choice(payload):
    """Return the first entry of payload["choices"], or None if absent."""
    choices = payload.get("choices") or []
    return choices[0] if choices else None


def _parse_sse_line(line):
    """Decode one server-sent-event line from vLLM.

    Returns the decoded dict, _STREAM_DONE for the "[DONE]" marker, or None
    for anything that is not a usable data event.
    """
    if not line.startswith("data: "):
        return None
    payload = line[6:]
    if payload == "[DONE]":
        return _STREAM_DONE
    try:
        chunk = json.loads(payload)
    except json.JSONDecodeError:
        return None
    return chunk if isinstance(chunk, dict) else None


def _parse_arguments(raw):
    """Turn a JSON-encoded tool-call argument string into an object."""
    if not isinstance(raw, str):
        return raw
    if not raw.strip():
        return {}
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Keep the raw text rather than dropping arguments we cannot parse.
        return raw


def _to_ollama_tool_calls(tool_calls):
    """Copy tool calls, decoding string "arguments" into JSON objects."""
    converted = []
    for call in tool_calls:
        function = dict(call.get("function") or {})
        if "arguments" in function:
            function["arguments"] = _parse_arguments(function["arguments"])
        converted.append({**call, "function": function})
    return converted


def _accumulate_tool_calls(pending, deltas):
    """Merge streamed tool-call fragments into pending, keyed by index."""
    for call in deltas or []:
        slot = pending.setdefault(call.get("index") or 0, {"name": "", "arguments": ""})
        function = call.get("function") or {}
        slot["name"] += function.get("name") or ""
        slot["arguments"] += function.get("arguments") or ""


def _ndjson(record):
    """Serialize one record as a newline-delimited JSON line."""
    return json.dumps(record) + "\n"


@app.get("/api/tags")
async def get_models():
    """List models in Ollama format, based on vLLM's /v1/models.

    Falls back to a single minimal entry when vLLM cannot be queried or
    reports no models.
    """
    fallback = {"models": [{"name": OLLAMA_NAME, "model": OLLAMA_NAME}]}
    entries = await _fetch_vllm_models()
    if not entries:
        return fallback
    models = []
    for m in entries:
        max_len = m.get("max_model_len", 131072)
        modified = datetime.fromtimestamp(m.get("created") or 0, tz=timezone.utc)
        models.append({
            "name": OLLAMA_NAME,
            "model": OLLAMA_NAME,
            "modified_at": modified.strftime("%Y-%m-%dT%H:%M:%S") + "+00:00",
            "size": 0,
            "digest": "",
            "details": _model_details(m, max_len),
            "capabilities": ["vision", "completion", "tools"]
        })
    return {"models": models}


@app.get("/api/ps")
async def running_models():
    """List running models (always the single configured model)."""
    return {"models": [{"name": OLLAMA_NAME, "model": OLLAMA_NAME}]}


@app.post("/api/show")
async def show_model(request: Request):
    """Return Ollama-style model information built from vLLM's /v1/models.

    The request body is not inspected. A minimal stub is returned when vLLM
    cannot be queried.
    """
    entries = await _fetch_vllm_models()
    if entries is None:
        return {"license": "", "modelfile": "", "parameters": ""}
    model_info = entries[0] if entries else {}
    max_len = model_info.get("max_model_len", 131072)

    # Build the parameter string.
    params = [
        f"max_model_len: {max_len}",
        f"created: {model_info.get('created', '')}",
        f"owned_by: {model_info.get('owned_by', '')}"
    ]

    return {
        "license": "",
        "modelfile": f"FROM {VLLM_MODEL}\n\nPARAMETER max_model_len {max_len}\n\nSYSTEM \"\"\"You are a helpful assistant.\"\"\"",
        "parameters": "\n".join(params),
        "details": _model_details(model_info, max_len),
        "capabilities": ["vision", "completion", "tools"]
    }


@app.post("/api/chat")
async def chat(request: Request):
    """Chat endpoint: translate an Ollama chat request into a vLLM one.

    Forwards messages, tools, tool_choice and response_format unchanged.

    Raises:
        HTTPException: 400 for an invalid body, 502 when vLLM is
            unreachable or returns no choices, or the upstream status code.
    """
    body = await _read_json(request)
    # NOTE(review): real Ollama streams unless "stream" is false; this proxy
    # keeps its original default of False -- confirm which one clients expect.
    stream = body.get("stream", False)

    # NOTE(review): Ollama-style "images" on messages are forwarded as-is and
    # not converted to OpenAI image parts -- verify against the client.
    vllm_request = {
        "model": VLLM_MODEL,
        "messages": body.get("messages", []),
        "stream": stream,
        **_sampling_params(body),
    }
    for key in ("tools", "tool_choice", "response_format"):
        if body.get(key):
            vllm_request[key] = body[key]

    headers = _auth_headers()

    if stream:
        # stream_response() logs the outgoing request itself.
        return StreamingResponse(
            stream_response(vllm_request, headers),
            media_type="application/x-ndjson"
        )

    data = await _post_vllm("/v1/chat/completions", vllm_request, headers)
    choice = _first_choice(data)
    if choice is None:
        raise HTTPException(status_code=502, detail="vLLM response contained no choices")
    message = choice.get("message") or {}
    result = {
        "model": OLLAMA_NAME,
        "created_at": _created_at(),
        # content is null on pure tool-call replies; send an empty string.
        "message": {"role": "assistant", "content": message.get("content") or ""},
        "done": True
    }
    if message.get("tool_calls"):
        result["message"]["tool_calls"] = _to_ollama_tool_calls(message["tool_calls"])
    return result


async def stream_response(vllm_request, headers):
    """Stream a vLLM chat completion as newline-delimited Ollama chunks.

    Yields one record per content delta, one record carrying any tool calls
    assembled from the deltas, and a final record with "done": true. An
    upstream failure is reported as a single {"error": ...} record.
    """
    url = f"{VLLM_URL}/v1/chat/completions"
    log_request("POST", url, vllm_request, headers)
    pending_calls = {}
    try:
        async with client.stream("POST", url, json=vllm_request, headers=headers) as resp:
            print(f"[{_now()}] 流式响应状态: {resp.status_code}", flush=True)
            if resp.status_code != 200:
                detail = (await resp.aread()).decode("utf-8", errors="replace")
                yield _ndjson({"error": detail})
                return
            async for line in resp.aiter_lines():
                chunk = _parse_sse_line(line)
                if chunk is _STREAM_DONE:
                    break
                choice = _first_choice(chunk) if chunk else None
                if choice is None:
                    continue
                delta = choice.get("delta") or {}
                _accumulate_tool_calls(pending_calls, delta.get("tool_calls"))
                content = delta.get("content")
                if content:
                    yield _ndjson({
                        "model": OLLAMA_NAME,
                        "created_at": _created_at(),
                        "message": {"role": "assistant", "content": content},
                        "done": False
                    })
    except httpx.HTTPError as exc:
        yield _ndjson({"error": f"vLLM request failed: {exc}"})
        return

    if pending_calls:
        tool_calls = [
            {"function": {"name": slot["name"], "arguments": _parse_arguments(slot["arguments"])}}
            for _, slot in sorted(pending_calls.items())
        ]
        yield _ndjson({
            "model": OLLAMA_NAME,
            "created_at": _created_at(),
            "message": {"role": "assistant", "content": "", "tool_calls": tool_calls},
            "done": False
        })
    # Always terminate the stream, even if vLLM never sent "[DONE]".
    yield _ndjson({
        "model": OLLAMA_NAME,
        "created_at": _created_at(),
        "message": {"role": "assistant", "content": ""},
        "done": True,
        "done_reason": "stop"
    })


@app.post("/api/generate")
async def generate(request: Request):
    """Text generation endpoint backed by vLLM's /v1/completions.

    Raises:
        HTTPException: 400 for an invalid body, 502 when vLLM is
            unreachable or returns no choices, or the upstream status code.
    """
    body = await _read_json(request)
    stream = body.get("stream", False)

    # NOTE(review): when no max_tokens/num_predict is supplied the backend's
    # own default output cap applies -- confirm it is large enough.
    vllm_request = {
        "model": VLLM_MODEL,
        "prompt": body.get("prompt", ""),
        "stream": stream,
        **_sampling_params(body),
    }

    headers = _auth_headers()

    if stream:
        # stream_generate() logs the outgoing request itself.
        return StreamingResponse(
            stream_generate(vllm_request, headers),
            media_type="application/x-ndjson"
        )

    data = await _post_vllm("/v1/completions", vllm_request, headers)
    choice = _first_choice(data)
    if choice is None:
        raise HTTPException(status_code=502, detail="vLLM response contained no choices")
    return {
        "model": OLLAMA_NAME,
        "created_at": _created_at(),
        "response": choice.get("text") or "",
        "done": True
    }


async def stream_generate(vllm_request, headers):
    """Stream a vLLM text completion as newline-delimited Ollama chunks.

    Yields one record per text delta and a final record with "done": true.
    An upstream failure is reported as a single {"error": ...} record.
    """
    url = f"{VLLM_URL}/v1/completions"
    log_request("POST", url, vllm_request, headers)
    try:
        async with client.stream("POST", url, json=vllm_request, headers=headers) as resp:
            print(f"[{_now()}] 流式生成状态: {resp.status_code}", flush=True)
            if resp.status_code != 200:
                detail = (await resp.aread()).decode("utf-8", errors="replace")
                yield _ndjson({"error": detail})
                return
            async for line in resp.aiter_lines():
                chunk = _parse_sse_line(line)
                if chunk is _STREAM_DONE:
                    break
                choice = _first_choice(chunk) if chunk else None
                if choice is None:
                    continue
                text = choice.get("text")
                if text:
                    yield _ndjson({
                        "model": OLLAMA_NAME,
                        "created_at": _created_at(),
                        "response": text,
                        "done": False
                    })
    except httpx.HTTPError as exc:
        yield _ndjson({"error": f"vLLM request failed: {exc}"})
        return

    # Always terminate the stream, even if vLLM never sent "[DONE]".
    yield _ndjson({
        "model": OLLAMA_NAME,
        "created_at": _created_at(),
        "response": "",
        "done": True,
        "done_reason": "stop"
    })


@app.post("/api/embed")
async def embed(request: Request):
    """Embedding endpoint: forward "input" to vLLM's /v1/embeddings."""
    body = await _read_json(request)
    inputs = body.get("input", [])
    if not isinstance(inputs, list):
        inputs = [inputs]

    vllm_body = {"model": VLLM_MODEL, "input": inputs}
    data = await _post_vllm("/v1/embeddings", vllm_body, _auth_headers())
    vectors = [item.get("embedding", []) for item in data.get("data") or []]
    return {"embeddings": vectors}


@app.post("/api/embeddings")
async def embeddings(request: Request):
    """Legacy embedding endpoint: forward "prompt" to vLLM's /v1/embeddings."""
    body = await _read_json(request)
    vllm_body = {"model": VLLM_MODEL, "input": body.get("prompt", "")}
    data = await _post_vllm("/v1/embeddings", vllm_body, _auth_headers())
    items = data.get("data") or []
    # An empty result list yields an empty vector instead of an IndexError.
    first = items[0] if items else {}
    return {"embedding": first.get("embedding", [])}


@app.post("/v1/chat/completions")
async def v1_chat(request: Request):
    """OpenAI-format chat endpoint: pass the request through to vLLM."""
    body = await _read_json(request)
    body["model"] = VLLM_MODEL
    headers = _auth_headers()

    if body.get("stream", False):
        # stream_v1_chat() logs the outgoing request itself.
        return StreamingResponse(
            stream_v1_chat(body, headers),
            media_type="text/event-stream"
        )
    return await _post_vllm("/v1/chat/completions", body, headers)


async def stream_v1_chat(body, headers, path="/v1/chat/completions"):
    """Relay an OpenAI-format event stream from vLLM line by line.

    Args:
        body: JSON payload forwarded to vLLM.
        headers: Headers forwarded to vLLM.
        path: vLLM endpoint to call; defaults to the chat endpoint.
    """
    url = f"{VLLM_URL}{path}"
    log_request("POST", url, body, headers)
    try:
        async with client.stream("POST", url, json=body, headers=headers) as resp:
            print(f"[{_now()}] v1 流式响应状态: {resp.status_code}", flush=True)
            async for line in resp.aiter_lines():
                # aiter_lines() strips the line ending; put it back so the
                # blank separator lines between events survive.
                yield line + "\n"
    except httpx.HTTPError as exc:
        error = {"error": {"message": f"vLLM request failed: {exc}"}}
        yield f"data: {json.dumps(error)}\n\n"


@app.post("/v1/completions")
async def v1_completions(request: Request):
    """OpenAI-format completion endpoint: pass the request through to vLLM."""
    body = await _read_json(request)
    body["model"] = VLLM_MODEL
    headers = _auth_headers()

    if body.get("stream", False):
        # A streamed reply is not a single JSON document, so relay it.
        return StreamingResponse(
            stream_v1_chat(body, headers, path="/v1/completions"),
            media_type="text/event-stream"
        )
    return await _post_vllm("/v1/completions", body, headers)


@app.get("/v1/models")
async def v1_models():
    """OpenAI-format model list, relayed from vLLM with its status code."""
    url = f"{VLLM_URL}/v1/models"
    log_request("GET", url)
    try:
        resp = await client.get(url, headers=_auth_headers())
    except httpx.HTTPError as exc:
        raise HTTPException(status_code=502, detail=f"vLLM request failed: {exc}") from exc
    data = _safe_json(resp)
    log_response(resp.status_code, data)
    if data is None:
        raise HTTPException(status_code=502, detail="vLLM returned a non-JSON response")
    return JSONResponse(status_code=resp.status_code, content=data)


if __name__ == "__main__":
    import uvicorn
    print("启动 Ollama 代理: http://127.0.0.1:11434")
    print(f"转发到 vLLM: {VLLM_URL}")
    # NOTE(review): binding to 0.0.0.0 exposes this unauthenticated proxy on
    # every interface; use "127.0.0.1" if only local clients should connect.
    uvicorn.run(app, host="0.0.0.0", port=11434)

运行以上代码 python main.py

然后回到VS自带模型点击添加

出现实现的模型接口,勾选模型,点击保存

就可以通过模型选择进行指定刚才设置的模型了

版权声明: 本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若内容造成侵权/违法违规/事实不符,请联系邮箱:809451989@qq.com进行投诉反馈,一经查实,立即删除!
网站建设 2026/6/24 2:59:14

Minecraft世界转换终极指南:如何用Chunker实现跨平台存档共享

Minecraft世界转换终极指南:如何用Chunker实现跨平台存档共享 【免费下载链接】Chunker Convert Minecraft worlds between Java Edition and Bedrock Edition 项目地址: https://gitcode.com/gh_mirrors/chu/Chunker 作为Minecraft玩家,您是否曾…

作者头像 李华
网站建设 2026/6/24 2:55:24

如何高效处理扫描文档:Scan Tailor智能优化完全教程

如何高效处理扫描文档:Scan Tailor智能优化完全教程 【免费下载链接】scantailor 项目地址: https://gitcode.com/gh_mirrors/sc/scantailor 你是否曾经扫描过纸质文档,却发现结果不尽如人意?页面倾斜、双页扫描、边缘阴影、图像模糊…

作者头像 李华
网站建设 2026/6/24 2:47:22

课时3:C 语言输入输出函数:printf 与 scanf 详解

摘要 本文详细介绍了 C 语言中两个最核心的输入输出函数 printf 和 scanf 的用法与注意事项。主要内容包括: printf 格式化输出:从基本用法、转义字符、占位符(%d, %f, %c, %s 等)到高级功能如控制小数位数(%.2f)和宽度对齐(%-10s),并提供了完整的占位符对照表。 sca…

作者头像 李华
网站建设 2026/6/24 2:44:48

芯片编程烧写烧录座,实力厂家全解析

芯片编程烧写烧录座作为芯片测试与验证的关键设备,其市场需求日益增长。在众多厂家中,深圳市谷易电子有限公司(以下简称“谷易电子”)凭借其深厚的技术积累和创新能力,成为了行业的佼佼者。本文将为您详细解析谷易电子…

作者头像 李华
网站建设 2026/6/24 2:43:12

设置目标IP的端口是否开放

Windows 首选 Test-NetConnection (PowerShell) 在 Windows 搜索框输入 PowerShell,打开它。输入以下命令格式,然后按回车:Test-NetConnection <目标IP地址或域名> -Port <端口号>例子:测试能否连接到 192.168.1.10 这…

作者头像 李华
网站建设 2026/6/24 2:41:41

多知识库路由:一个入口先选库再检索

结论先摆:当你有好几个知识库(产品库、售后库、政策库……),千万别把用户问题一股脑甩给所有库一起检索。正确做法是在前面加一道"路由"——先判断这个问题该去哪个库,选定了再进那个库检索。我这么改完之后…

作者头像 李华