最新版本的 Visual Studio 已经可以接入其他大语言模型了,通过“管理模型”的入口进入设置
但是它不支持 vLLM、llama.cpp 等其他接口
而且 Ollama 接口也只支持本地的 http://localhost:11434,其他地址好像不支持
到了以上位置,点击添加按钮没啥用,改地址也不行
直接用大模型帮忙写一个脚本,直接做接口转换,把VS请求的localhost接口转到已经部署的vllm接口,其他接口类似就行,但是需要注意的是要实现完整的ollama接口
#!/usr/bin/env python3
"""Ollama-to-vLLM interface proxy.

Listens on the default Ollama port (11434) and translates the Ollama HTTP API
(``/api/tags``, ``/api/ps``, ``/api/show``, ``/api/chat``, ``/api/generate``,
``/api/embed``, ``/api/embeddings``) into calls against a vLLM
OpenAI-compatible server. The OpenAI-style ``/v1/*`` routes are passed
through with only the model name rewritten.

Usage:
    python main.py
"""
import json
import os
import sys
from datetime import datetime, timezone

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
import httpx

# ===== Configuration =====
VLLM_URL = "http://172.16.1.15:8000"
VLLM_API_KEY = ""  # fill in when the vLLM server requires an API key
OLLAMA_NAME = "qwen3.6-27b"  # model name shown through the local Ollama API
VLLM_MODEL = "/data/ai/models/Qwen3.6-27B"  # model name on the vLLM side
# =========================

app = FastAPI()
client = httpx.AsyncClient(timeout=300.0)

# Base64 prefixes of common image formats, used to pick a MIME type for the
# bare base64 strings Ollama clients send in ``images``.
_IMAGE_MAGIC = (
    ("/9j/", "image/jpeg"),
    ("iVBOR", "image/png"),
    ("R0lGOD", "image/gif"),
    ("UklGR", "image/webp"),
)


# --------------------------------------------------------------------------
# Logging helpers
# --------------------------------------------------------------------------

def log_request(method, url, body=None, headers=None):
    """Print a request log entry.

    The ``Authorization`` header value is masked so the API key never ends
    up in the console output.
    """
    print(f"\n{'='*60}", flush=True)
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {method} {url}", flush=True)
    if headers:
        safe_headers = {
            key: ("Bearer ***" if key.lower() == "authorization" else value)
            for key, value in headers.items()
        }
        print(f"Headers: {json.dumps(safe_headers, ensure_ascii=False)}", flush=True)
    if body:
        print(f"Body: {json.dumps(body, ensure_ascii=False, indent=2)}", flush=True)
    print(f"{'='*60}\n", flush=True)


def log_response(status_code, data=None):
    """Print a response log entry (status code plus optional JSON payload)."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Response: {status_code}", flush=True)
    if data:
        print(f"Data: {json.dumps(data, ensure_ascii=False, indent=2)}", flush=True)
    print(f"{'='*60}\n", flush=True)


# --------------------------------------------------------------------------
# Upstream (vLLM) helpers
# --------------------------------------------------------------------------

def _auth_headers():
    """Return the Authorization header dict, or an empty dict without a key."""
    return {"Authorization": f"Bearer {VLLM_API_KEY}"} if VLLM_API_KEY else {}


def _json_headers():
    """Return the headers used for JSON POST requests to vLLM."""
    return {"Content-Type": "application/json", **_auth_headers()}


def _safe_json(resp):
    """Return the decoded JSON body of *resp*, or None if it is not JSON."""
    try:
        return resp.json()
    except ValueError:
        return None


def _utc_now_iso():
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


async def _read_json(request):
    """Parse the request body as a JSON object; raise HTTP 400 otherwise."""
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"invalid JSON body: {exc}") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="request body must be a JSON object")
    return body


async def _fetch_vllm_models():
    """Fetch ``/v1/models`` from vLLM.

    Returns the decoded JSON dict on success, or None when the server is
    unreachable, answers with a non-200 status, or returns a non-JSON body.
    """
    url = f"{VLLM_URL}/v1/models"
    log_request("GET", url)
    try:
        resp = await client.get(url, headers=_auth_headers())
    except httpx.HTTPError as exc:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] vLLM request failed: {exc}", flush=True)
        return None
    data = _safe_json(resp)
    log_response(resp.status_code, data)
    if resp.status_code != 200 or not isinstance(data, dict):
        return None
    return data


async def _post_upstream(url, payload, headers):
    """POST *payload* to vLLM and return the decoded JSON response.

    Raises:
        HTTPException: 502 when vLLM is unreachable or answers with a
            non-JSON body; the upstream status code when it is not 200.
    """
    log_request("POST", url, payload, headers)
    try:
        resp = await client.post(url, json=payload, headers=headers)
    except httpx.HTTPError as exc:
        raise HTTPException(status_code=502, detail=f"vLLM request failed: {exc}") from exc
    # Decode defensively: an error page (e.g. from a reverse proxy) may not
    # be JSON, and must not mask the real upstream status code.
    data = _safe_json(resp)
    log_response(resp.status_code, data)
    if resp.status_code != 200:
        raise HTTPException(status_code=resp.status_code, detail=resp.text)
    if data is None:
        raise HTTPException(status_code=502, detail="vLLM returned a non-JSON response")
    return data


async def _iter_sse_events(url, payload, headers, status_label):
    """Stream a vLLM SSE response and yield ``(kind, value)`` tuples.

    ``kind`` is ``"chunk"`` with a decoded JSON dict, or ``"error"`` with a
    message string (non-200 upstream status or a transport failure). The
    generator ends when ``[DONE]`` is received or the stream is exhausted.
    """
    log_request("POST", url, payload, headers)
    try:
        async with client.stream("POST", url, json=payload, headers=headers) as resp:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {status_label}: {resp.status_code}", flush=True)
            if resp.status_code != 200:
                raw = await resp.aread()
                yield "error", raw.decode("utf-8", errors="replace")
                return
            async for line in resp.aiter_lines():
                if not line.startswith("data:"):
                    continue
                data = line[5:].strip()
                if data == "[DONE]":
                    return
                try:
                    chunk = json.loads(data)
                except json.JSONDecodeError:
                    continue
                if isinstance(chunk, dict):
                    yield "chunk", chunk
    except httpx.HTTPError as exc:
        yield "error", f"vLLM request failed: {exc}"


async def _passthrough_stream(url, body, headers, status_label):
    """Forward a vLLM SSE response line by line without modification."""
    log_request("POST", url, body, headers)
    try:
        async with client.stream("POST", url, json=body, headers=headers) as resp:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {status_label}: {resp.status_code}", flush=True)
            async for line in resp.aiter_lines():
                yield line + "\n"
    except httpx.HTTPError as exc:
        error = {"error": {"message": f"vLLM request failed: {exc}", "type": "upstream_error"}}
        yield "data: " + json.dumps(error, ensure_ascii=False) + "\n\n"


# --------------------------------------------------------------------------
# Ollama <-> OpenAI format conversion
# --------------------------------------------------------------------------

def _option(body, *names, default=None):
    """Look up a setting in ``body["options"]`` first, then in *body* itself.

    Ollama clients put sampling settings under ``options``; top-level keys
    are still honoured for callers that send them there.
    """
    options = body.get("options")
    sources = (options, body) if isinstance(options, dict) else (body,)
    for source in sources:
        for name in names:
            value = source.get(name)
            if value is not None:
                return value
    return default


def _apply_sampling(body, target):
    """Copy sampling parameters from an Ollama request into a vLLM request."""
    target["temperature"] = _option(body, "temperature", default=1.0)
    target["top_p"] = _option(body, "top_p", default=1.0)
    max_tokens = _option(body, "num_predict", "max_tokens")
    # Ollama uses negative num_predict values for "no limit"; leave unset.
    if isinstance(max_tokens, int) and max_tokens > 0:
        target["max_tokens"] = max_tokens
    for name in ("stop", "seed"):
        value = _option(body, name)
        if value is not None:
            target[name] = value


def _image_url(image):
    """Turn an Ollama image entry into a URL usable in an ``image_url`` part."""
    if image.startswith(("http://", "https://", "data:")):
        return image
    mime = next((m for prefix, m in _IMAGE_MAGIC if image.startswith(prefix)), "image/png")
    return f"data:{mime};base64,{image}"


def _content_with_images(content, images):
    """Build OpenAI multi-part content from text plus Ollama ``images``."""
    if isinstance(content, list):
        parts = list(content)
    elif content:
        parts = [{"type": "text", "text": content}]
    else:
        parts = []
    for image in images:
        if isinstance(image, str) and image:
            parts.append({"type": "image_url", "image_url": {"url": _image_url(image)}})
    return parts


def _to_openai_tool_call(call, fallback_id):
    """Convert one Ollama tool call (object arguments) to OpenAI format."""
    function = dict(call.get("function") or {})
    arguments = function.get("arguments")
    if not isinstance(arguments, str):
        # OpenAI carries arguments as a JSON-encoded string.
        function["arguments"] = json.dumps(
            arguments if arguments is not None else {}, ensure_ascii=False
        )
    return {
        "id": call.get("id") or fallback_id,
        "type": call.get("type") or "function",
        "function": function,
    }


def _to_openai_messages(messages):
    """Convert Ollama chat messages to OpenAI chat messages.

    * ``images`` (bare base64 strings) become ``image_url`` content parts.
    * Assistant ``tool_calls`` get an ``id`` and string-encoded arguments.
    * ``tool`` replies without ``tool_call_id`` are linked to the matching
      pending tool call (by ``tool_name``/``name``, else the oldest one).
    """
    converted = []
    pending = []  # (call_id, function_name) of calls still awaiting a reply
    counter = 0
    for msg in messages:
        if not isinstance(msg, dict):
            converted.append(msg)
            continue
        out = {key: value for key, value in msg.items() if key != "images"}
        if msg.get("images"):
            out["content"] = _content_with_images(msg.get("content"), msg["images"])
        if msg.get("tool_calls"):
            calls = []
            for call in msg["tool_calls"]:
                if not isinstance(call, dict):
                    calls.append(call)
                    continue
                counter += 1
                openai_call = _to_openai_tool_call(call, f"call_{counter}")
                calls.append(openai_call)
                pending.append((openai_call["id"], openai_call["function"].get("name")))
            out["tool_calls"] = calls
        elif msg.get("role") == "tool":
            if msg.get("tool_call_id"):
                pending = [p for p in pending if p[0] != msg["tool_call_id"]]
            elif pending:
                wanted = msg.get("tool_name") or msg.get("name")
                match = next((p for p in pending if p[1] == wanted), pending[0])
                pending.remove(match)
                out["tool_call_id"] = match[0]
        converted.append(out)
    return converted


def _to_ollama_tool_calls(calls):
    """Convert OpenAI tool calls (string arguments) to Ollama format."""
    result = []
    for call in calls:
        function = call.get("function") or {}
        arguments = function.get("arguments")
        if isinstance(arguments, str):
            try:
                arguments = json.loads(arguments) if arguments.strip() else {}
            except json.JSONDecodeError:
                pass  # keep the raw string rather than lose the model output
        entry = {
            "function": {
                "name": function.get("name", ""),
                "arguments": arguments if arguments is not None else {},
            }
        }
        if call.get("id"):
            entry["id"] = call["id"]
        result.append(entry)
    return result


def _model_details(max_len, parent):
    """Return the static ``details`` block reported for the proxied model."""
    return {
        "parent_model": parent or "",
        "format": "vllm",
        "family": "qwen3",
        "families": ["qwen3"],
        "parameter_size": "27B",
        "quantization_level": "FP16",
        "context_length": max_len,
        "embedding_length": 5120,
    }


# --------------------------------------------------------------------------
# Ollama API
# --------------------------------------------------------------------------

@app.get("/api/tags")
async def get_models():
    """List models, built from vLLM's ``/v1/models``.

    Falls back to a minimal static entry when vLLM cannot be queried.
    """
    fallback = {"models": [{"name": OLLAMA_NAME, "model": OLLAMA_NAME}]}
    data = await _fetch_vllm_models()
    if data is None:
        return fallback

    models = []
    for m in data.get("data") or []:
        max_len = m.get("max_model_len", 131072)
        created = datetime.fromtimestamp(m.get("created") or 0, tz=timezone.utc)
        models.append({
            "name": OLLAMA_NAME,
            "model": OLLAMA_NAME,
            "modified_at": created.strftime("%Y-%m-%dT%H:%M:%S") + "+00:00",
            "size": 0,
            "digest": "",
            "details": _model_details(max_len, m.get("parent", "")),
            "capabilities": ["vision", "completion", "tools"],
        })
    return {"models": models} if models else fallback


@app.get("/api/ps")
async def running_models():
    """List running models (always the single proxied model)."""
    return {"models": [{"name": OLLAMA_NAME, "model": OLLAMA_NAME}]}


@app.post("/api/show")
async def show_model(request: Request):
    """Return Ollama-style model information derived from vLLM's model list."""
    data = await _fetch_vllm_models()
    if data is None:
        return {"license": "", "modelfile": "", "parameters": ""}

    # An empty model list must not crash the endpoint; fall back to defaults.
    entries = data.get("data") or [{}]
    model_info = entries[0]
    max_len = model_info.get("max_model_len", 131072)

    params = [
        f"max_model_len: {max_len}",
        f"created: {model_info.get('created', '')}",
        f"owned_by: {model_info.get('owned_by', '')}",
    ]

    return {
        "license": "",
        "modelfile": f"FROM {VLLM_MODEL}\n\nPARAMETER max_model_len {max_len}\n\nSYSTEM \"\"\"You are a helpful assistant.\"\"\"",
        "parameters": "\n".join(params),
        "details": _model_details(max_len, model_info.get("parent", "")),
        "capabilities": ["vision", "completion", "tools"],
    }


@app.post("/api/chat")
async def chat(request: Request):
    """Chat endpoint: Ollama request -> vLLM chat completion -> Ollama reply.

    Supports tool calling, images and structured output. Streaming is the
    default, as in Ollama; send ``"stream": false`` for a single JSON reply.
    """
    body = await _read_json(request)
    stream = body.get("stream", True)

    vllm_request = {
        "model": VLLM_MODEL,
        "messages": _to_openai_messages(body.get("messages") or []),
        "stream": stream,
    }
    _apply_sampling(body, vllm_request)

    if body.get("tools"):
        vllm_request["tools"] = body["tools"]
    if body.get("tool_choice"):
        vllm_request["tool_choice"] = body["tool_choice"]

    # Structured output: accept an OpenAI response_format or Ollama's format.
    output_format = body.get("format")
    if body.get("response_format"):
        vllm_request["response_format"] = body["response_format"]
    elif output_format == "json":
        vllm_request["response_format"] = {"type": "json_object"}
    elif isinstance(output_format, dict):
        vllm_request["response_format"] = {
            "type": "json_schema",
            "json_schema": {"name": "response", "schema": output_format},
        }

    headers = _json_headers()

    if stream:
        return StreamingResponse(
            stream_response(vllm_request, headers),
            media_type="application/x-ndjson",
        )

    data = await _post_upstream(f"{VLLM_URL}/v1/chat/completions", vllm_request, headers)
    try:
        choice = data["choices"][0]
        message = choice["message"]
    except (KeyError, IndexError, TypeError) as exc:
        raise HTTPException(status_code=502, detail="unexpected vLLM response shape") from exc

    result = {
        "model": OLLAMA_NAME,
        "created_at": _utc_now_iso(),
        # content is null when the model only emitted tool calls
        "message": {"role": "assistant", "content": message.get("content") or ""},
        "done": True,
        "done_reason": choice.get("finish_reason") or "stop",
    }
    if message.get("tool_calls"):
        result["message"]["tool_calls"] = _to_ollama_tool_calls(message["tool_calls"])
    return result


async def stream_response(vllm_request, headers):
    """Stream a chat completion as Ollama NDJSON records.

    Text deltas are forwarded as they arrive. Tool-call deltas are
    accumulated and sent as one record before the final ``done`` record.
    Upstream failures are reported as an ``{"error": ...}`` record.
    """
    url = f"{VLLM_URL}/v1/chat/completions"
    tool_fragments = {}
    done_reason = "stop"
    failed = False

    async for kind, payload in _iter_sse_events(url, vllm_request, headers, "流式响应状态"):
        if kind == "error":
            yield json.dumps({"error": payload}, ensure_ascii=False) + "\n"
            failed = True
            continue  # let the upstream generator finish and release the connection

        choices = payload.get("choices") or []
        if not choices:
            continue  # e.g. a usage-only chunk
        choice = choices[0]
        if choice.get("finish_reason"):
            done_reason = choice["finish_reason"]
        delta = choice.get("delta") or {}

        for fragment in delta.get("tool_calls") or []:
            slot = tool_fragments.setdefault(
                fragment.get("index", 0), {"id": None, "name": "", "arguments": ""}
            )
            if fragment.get("id"):
                slot["id"] = fragment["id"]
            function = fragment.get("function") or {}
            if function.get("name"):
                slot["name"] = function["name"]
            if function.get("arguments"):
                slot["arguments"] += function["arguments"]

        content = delta.get("content")
        if content:
            yield json.dumps({
                "model": OLLAMA_NAME,
                "created_at": _utc_now_iso(),
                "message": {"role": "assistant", "content": content},
                "done": False,
            }) + "\n"

    if failed:
        return

    if tool_fragments:
        calls = [
            {
                "id": tool_fragments[index]["id"],
                "function": {
                    "name": tool_fragments[index]["name"],
                    "arguments": tool_fragments[index]["arguments"],
                },
            }
            for index in sorted(tool_fragments)
        ]
        yield json.dumps({
            "model": OLLAMA_NAME,
            "created_at": _utc_now_iso(),
            "message": {
                "role": "assistant",
                "content": "",
                "tool_calls": _to_ollama_tool_calls(calls),
            },
            "done": False,
        }) + "\n"

    yield json.dumps({
        "model": OLLAMA_NAME,
        "created_at": _utc_now_iso(),
        "message": {"role": "assistant", "content": ""},
        "done": True,
        "done_reason": done_reason,
    }) + "\n"


@app.post("/api/generate")
async def generate(request: Request):
    """Text generation endpoint, mapped onto vLLM's ``/v1/completions``.

    Streaming is the default, as in Ollama.
    """
    body = await _read_json(request)
    stream = body.get("stream", True)

    vllm_request = {
        "model": VLLM_MODEL,
        "prompt": body.get("prompt", ""),
        "stream": stream,
    }
    _apply_sampling(body, vllm_request)

    headers = _json_headers()

    if stream:
        return StreamingResponse(
            stream_generate(vllm_request, headers),
            media_type="application/x-ndjson",
        )

    data = await _post_upstream(f"{VLLM_URL}/v1/completions", vllm_request, headers)
    try:
        choice = data["choices"][0]
        text = choice["text"]
    except (KeyError, IndexError, TypeError) as exc:
        raise HTTPException(status_code=502, detail="unexpected vLLM response shape") from exc
    return {
        "model": OLLAMA_NAME,
        "created_at": _utc_now_iso(),
        "response": text,
        "done": True,
        "done_reason": choice.get("finish_reason") or "stop",
    }


async def stream_generate(vllm_request, headers):
    """Stream a text completion as Ollama NDJSON records."""
    url = f"{VLLM_URL}/v1/completions"
    done_reason = "stop"
    failed = False

    async for kind, payload in _iter_sse_events(url, vllm_request, headers, "流式生成状态"):
        if kind == "error":
            yield json.dumps({"error": payload}, ensure_ascii=False) + "\n"
            failed = True
            continue

        choices = payload.get("choices") or []
        if not choices:
            continue
        choice = choices[0]
        if choice.get("finish_reason"):
            done_reason = choice["finish_reason"]
        text = choice.get("text")
        if text:
            yield json.dumps({
                "model": OLLAMA_NAME,
                "created_at": _utc_now_iso(),
                "response": text,
                "done": False,
            }) + "\n"

    if failed:
        return

    yield json.dumps({
        "model": OLLAMA_NAME,
        "created_at": _utc_now_iso(),
        "response": "",
        "done": True,
        "done_reason": done_reason,
    }) + "\n"


@app.post("/api/embed")
async def embed(request: Request):
    """Generate embeddings for one or more inputs via vLLM."""
    body = await _read_json(request)
    inputs = body.get("input", [])
    if not isinstance(inputs, list):
        inputs = [inputs]

    vllm_body = {"model": VLLM_MODEL, "input": inputs}
    data = await _post_upstream(f"{VLLM_URL}/v1/embeddings", vllm_body, _json_headers())
    vectors = [item.get("embedding", []) for item in data.get("data") or []]
    return {"model": OLLAMA_NAME, "embeddings": vectors}


@app.post("/api/embeddings")
async def embeddings(request: Request):
    """Legacy single-prompt embedding endpoint, served via vLLM."""
    body = await _read_json(request)

    vllm_body = {"model": VLLM_MODEL, "input": body.get("prompt", "")}
    data = await _post_upstream(f"{VLLM_URL}/v1/embeddings", vllm_body, _json_headers())
    # Guard against an empty "data" list instead of raising IndexError.
    items = data.get("data") or [{}]
    return {"embedding": items[0].get("embedding", [])}


# --------------------------------------------------------------------------
# OpenAI-compatible passthrough
# --------------------------------------------------------------------------

@app.post("/v1/chat/completions")
async def v1_chat(request: Request):
    """OpenAI-format chat endpoint, passed straight through to vLLM."""
    body = await _read_json(request)
    body["model"] = VLLM_MODEL
    headers = _json_headers()

    if body.get("stream", False):
        return StreamingResponse(
            stream_v1_chat(body, headers),
            media_type="text/event-stream",
        )
    return await _post_upstream(f"{VLLM_URL}/v1/chat/completions", body, headers)


async def stream_v1_chat(body, headers):
    """Forward the OpenAI-format chat SSE stream from vLLM unchanged."""
    url = f"{VLLM_URL}/v1/chat/completions"
    async for line in _passthrough_stream(url, body, headers, "v1 流式响应状态"):
        yield line


@app.post("/v1/completions")
async def v1_completions(request: Request):
    """OpenAI-format completion endpoint, passed straight through to vLLM."""
    body = await _read_json(request)
    body["model"] = VLLM_MODEL
    headers = _json_headers()
    url = f"{VLLM_URL}/v1/completions"

    if body.get("stream", False):
        # An SSE body cannot be parsed as one JSON document; stream it.
        return StreamingResponse(
            _passthrough_stream(url, body, headers, "v1 流式补全状态"),
            media_type="text/event-stream",
        )
    return await _post_upstream(url, body, headers)


@app.get("/v1/models")
async def v1_models():
    """OpenAI-format model list, relayed with the upstream status code."""
    url = f"{VLLM_URL}/v1/models"
    log_request("GET", url)
    try:
        resp = await client.get(url, headers=_auth_headers())
    except httpx.HTTPError as exc:
        raise HTTPException(status_code=502, detail=f"vLLM request failed: {exc}") from exc
    data = _safe_json(resp)
    log_response(resp.status_code, data)
    if data is None:
        raise HTTPException(status_code=502, detail="vLLM returned a non-JSON response")
    return JSONResponse(content=data, status_code=resp.status_code)


if __name__ == "__main__":
    import uvicorn

    print("启动 Ollama 代理: http://127.0.0.1:11434")
    print(f"转发到 vLLM: {VLLM_URL}")
    # NOTE: 0.0.0.0 exposes the proxy to the whole network; use "127.0.0.1"
    # if only a local Visual Studio instance needs to reach it.
    uvicorn.run(app, host="0.0.0.0", port=11434)

# To run the proxy: python main.py
然后回到VS自带模型点击添加
出现实现的模型接口,勾选模型,点击保存
然后就可以在模型选择中指定刚才设置的模型了