视觉理解模型

部分大模型具备视觉理解能力，当您传入图片/视频时，大模型可以理解图片/视频里的视觉信息，并结合这些信息完成如描述图片、文字识别、内容创作等任务。通过这篇教程，您可以了解如何通过调用大模型 API 来识别传入图片/视频里的信息。

1.模型系列

Qwen2.5-VL系列

Qwen2.5-VL系列	上下文长度	QPM	TPM	Tokens计费
Qwen2.5-VL-72B-Instruct	128K	120	50w	输入：￥16/M Tokens 输出：￥48/M Tokens
Qwen2.5-VL-32B-Instruct	128K	60	10w	输入：￥8/M Tokens 输出：￥24/M Tokens
Qwen2.5-VL-7B-Instruct	128K	1200	100w	输入：￥2/M Tokens 输出：￥5/M Tokens
Qwen2.5-VL-3B-Instruct	128K	1200	100w	输入：￥1.2/M Tokens 输出：￥3.6/M Tokens

GLM-4V系列

GLM-4V系列	上下文长度	并发数	Tokens计费
GLM-4V-Flash	4K	30	免费
GLM-4V	8K	30	输入：￥50/M Tokens 输出：￥50/M Tokens
GLM-4V-Plus	8K	10	输入：￥4/M Tokens 输出：￥4/M Tokens

ERNIE-4.5-Turbo 系列

ERNIE-4.5-Turbo 系列	上下文长度	RPM	TPM	Tokens计费
ERNIE-4.5-Turbo-32K	32K	1k	20w	输入：¥4.29/M Tokens 输出：¥12.86/M Tokens

2.使用前提

您已创建大模型平台API_Key，用于模型调用。

若您还未申请，请前往「AI 智算云平台 - 大模型平台 - 模型广场」进行申请。

3.API接入方式

视觉理解模型支持本地客户端接入、代码接入两种形式。

3.1 本地客户端接入

支持三种主流工具：Chatbox、Cherry Studio、AnythingLLM

3.2 代码接入

支持 curl、Python、Golang、Java、Node.js 等多种方式，以下依次为 Golang 与 Java 的调用示例。

package main
 
import (
    "bytes"
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
)
 
// Request payload types for the chat-completions call, declared as one group.
type (
    // ChatRequest is the top-level JSON body: the model identifier plus the
    // list of conversation messages.
    ChatRequest struct {
        Model    string    `json:"model"`
        Messages []Message `json:"messages"`
    }

    // Message is a single conversation turn; its content is a list of parts
    // rather than one plain string.
    Message struct {
        Role    string    `json:"role"`
        Content []Content `json:"content"`
    }

    // Content is one part of a message. Type names the kind of part; ImageURL
    // and Text are left out of the encoded JSON when nil / empty (omitempty).
    Content struct {
        Type     string    `json:"type"`
        ImageURL *ImageURL `json:"image_url,omitempty"`
        Text     string    `json:"text,omitempty"`
    }

    // ImageURL wraps the image reference string sent under the "url" key.
    ImageURL struct {
        URL string `json:"url"`
    }
)
 
// main runs the example request and terminates with a non-zero exit status
// if any step fails.
func main() {
    // All work happens in run so that its deferred cleanup executes before
    // log.Fatal exits the process.
    if err := run(); err != nil {
        log.Fatal(err)
    }
}

// run sends a single chat-completions request containing one image part and
// one text part, then pretty-prints the JSON response to stdout.
//
// It returns an error when the request cannot be built or sent, when the
// server answers with a status other than 200, or when the response body is
// not valid JSON.
func run() error {
    // Placeholders: replace with real values before running. Loading the API
    // key from an environment variable is safer than hard-coding it.
    apiKey := "申请到的key"
    modelID := "模型ID"
    imageURL := "图片url或base64编码"

    // Build the request body: one user message made of an image and a prompt.
    reqBody := ChatRequest{
        Model: modelID,
        Messages: []Message{
            {
                Role: "user",
                Content: []Content{
                    {
                        Type:     "image_url",
                        ImageURL: &ImageURL{URL: imageURL},
                    },
                    {
                        Type: "text",
                        Text: "图里有什么",
                    },
                },
            },
        },
    }

    jsonData, err := json.Marshal(reqBody)
    if err != nil {
        return fmt.Errorf("JSON序列化失败: %w", err)
    }

    // $BASE_URL is a documentation placeholder, not expanded by Go; substitute
    // the real service host.
    req, err := http.NewRequest(
        http.MethodPost,
        "https://$BASE_URL/v1/chat/completions",
        bytes.NewReader(jsonData),
    )
    if err != nil {
        return fmt.Errorf("创建请求失败: %w", err)
    }
    req.Header.Set("Authorization", "Bearer "+apiKey)
    req.Header.Set("Content-Type", "application/json")

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return fmt.Errorf("请求发送失败: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        // Echo the server's error payload to stderr so the failure can be
        // diagnosed, then report the status to the caller.
        if _, copyErr := io.Copy(os.Stderr, resp.Body); copyErr != nil {
            log.Printf("读取错误响应失败: %v", copyErr)
        }
        fmt.Fprintln(os.Stderr)
        return fmt.Errorf("请求失败，状态码: %d", resp.StatusCode)
    }

    // Decode into a generic map; the response is only re-indented for display.
    var result map[string]interface{}
    if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
        return fmt.Errorf("响应解析失败: %w", err)
    }

    prettyResult, err := json.MarshalIndent(result, "", "  ")
    if err != nil {
        return fmt.Errorf("响应格式化失败: %w", err)
    }
    fmt.Println("响应结果:")
    fmt.Println(string(prettyResult))
    return nil
}

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
 
/**
 * Minimal example that posts one chat-completions request containing an image
 * part and a text part, then prints the response body.
 *
 * <p>The URL host, API key, model ID and image reference are documentation
 * placeholders and must be replaced before running.
 */
public class ParaLLMRequest {
    /**
     * Sends the request and prints the response. On a non-200 status the
     * status code and the server's error body (if any) are printed instead.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpURLConnection connection = null;
        try {
            // 1. Open the connection ($BASE_URL is a placeholder for the service host).
            URL url = new URL("https://$BASE_URL/v1/chat/completions");
            connection = (HttpURLConnection) url.openConnection();

            // 2. Request method and headers.
            connection.setRequestMethod("POST");
            connection.setRequestProperty("Content-Type", "application/json");
            connection.setRequestProperty("Authorization", "Bearer 申请到的key");
            connection.setDoOutput(true);

            // 3. JSON request body: one user message with an image and a prompt.
            String jsonInputString = "{"
                + "\"model\": \"模型ID\","
                + "\"messages\": [{"
                + "    \"role\": \"user\","
                + "    \"content\": ["
                + "        {"
                + "            \"type\": \"image_url\","
                + "            \"image_url\": {"
                + "                \"url\": \"图片url或base64编码\""
                + "            }"
                + "        },"
                + "        {"
                + "            \"type\": \"text\","
                + "            \"text\": \"图里有什么\""
                + "        }"
                + "    ]"
                + "}]}";

            // 4. Write the body as UTF-8.
            try (OutputStream os = connection.getOutputStream()) {
                byte[] input = jsonInputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
                os.write(input, 0, input.length);
            }

            // 5. Handle the response.
            int responseCode = connection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK) {
                System.out.println(readBody(connection.getInputStream()));
            } else {
                System.out.println("请求失败，状态码: " + responseCode);
                // The error payload is delivered on the error stream, which may be
                // null when the server sent no body.
                String errorBody = readBody(connection.getErrorStream());
                if (!errorBody.isEmpty()) {
                    System.err.println(errorBody);
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the underlying connection whether or not the call succeeded.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reads the whole stream as UTF-8 text, trimming each line and joining the
     * lines without separators.
     *
     * @param in stream to read; may be null
     * @return the concatenated text, or an empty string when {@code in} is null
     * @throws IOException if reading fails
     */
    private static String readBody(java.io.InputStream in) throws IOException {
        if (in == null) {
            return "";
        }
        try (BufferedReader br = new BufferedReader(
            new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
            StringBuilder response = new StringBuilder();
            String responseLine;
            while ((responseLine = br.readLine()) != null) {
                response.append(responseLine.trim());
            }
            return response.toString();
        }
    }
}

文本对话图像生成