
Commit f7cde17

Merge pull request ChatGPTNextWeb#6292 from Little-LittleProgrammer/feature/alibaba-omni-support
feat(alibaba): Added alibaba vision model and omni model support
2 parents: 570cbb3 + a2c4e46

File tree

4 files changed: +61 -16 lines


app/client/api.ts

+5 -0
@@ -40,6 +40,11 @@ export interface MultimodalContent {
   };
 }
 
+export interface MultimodalContentForAlibaba {
+  text?: string;
+  image?: string;
+}
+
 export interface RequestMessage {
   role: MessageRole;
   content: string | MultimodalContent[];
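
For context, the new interface mirrors the part shape that DashScope's multimodal-generation endpoint uses: each content part carries either a text or an image field, rather than the OpenAI-style { type, image_url } object of MultimodalContent. A minimal sketch with hypothetical values:

import { MultimodalContentForAlibaba } from "@/app/client/api";

// Hypothetical user turn for a qwen-vl / qwen-omni request: each part
// carries either `text` or `image` (an http(s) URL or base64 data URL).
const parts: MultimodalContentForAlibaba[] = [
  { image: "https://example.com/cat.png" },
  { text: "What is in this picture?" },
];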

app/client/platforms/alibaba.ts

+26 -12
@@ -7,20 +7,25 @@ import {
   ChatMessageTool,
   usePluginStore,
 } from "@/app/store";
-import { streamWithThink } from "@/app/utils/chat";
+import {
+  preProcessImageContentForAlibabaDashScope,
+  streamWithThink,
+} from "@/app/utils/chat";
 import {
   ChatOptions,
   getHeaders,
   LLMApi,
   LLMModel,
   SpeechOptions,
   MultimodalContent,
+  MultimodalContentForAlibaba,
 } from "../api";
 import { getClientConfig } from "@/app/config/client";
 import {
   getMessageTextContent,
   getMessageTextContentWithoutThinking,
   getTimeoutMSByModel,
+  isVisionModel,
 } from "@/app/utils";
 import { fetch } from "@/app/utils/stream";
 

@@ -89,14 +94,6 @@ export class QwenApi implements LLMApi {
   }
 
   async chat(options: ChatOptions) {
-    const messages = options.messages.map((v) => ({
-      role: v.role,
-      content:
-        v.role === "assistant"
-          ? getMessageTextContentWithoutThinking(v)
-          : getMessageTextContent(v),
-    }));
-
     const modelConfig = {
       ...useAppConfig.getState().modelConfig,
       ...useChatStore.getState().currentSession().mask.modelConfig,
@@ -105,6 +102,21 @@ export class QwenApi implements LLMApi {
       },
     };
 
+    const visionModel = isVisionModel(options.config.model);
+
+    const messages: ChatOptions["messages"] = [];
+    for (const v of options.messages) {
+      const content = (
+        visionModel
+          ? await preProcessImageContentForAlibabaDashScope(v.content)
+          : v.role === "assistant"
+            ? getMessageTextContentWithoutThinking(v)
+            : getMessageTextContent(v)
+      ) as any;
+
+      messages.push({ role: v.role, content });
+    }
+
     const shouldStream = !!options.config.stream;
     const requestPayload: RequestPayload = {
       model: modelConfig.model,
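
Extracted from the loop above, the per-message decision reads roughly as follows. This is a sketch, not part of the diff: buildAlibabaContent is a hypothetical name, and the imports are the same ones the diff adds to alibaba.ts.

import { ChatOptions } from "@/app/client/api";
import {
  getMessageTextContent,
  getMessageTextContentWithoutThinking,
} from "@/app/utils";
import { preProcessImageContentForAlibabaDashScope } from "@/app/utils/chat";

// Hypothetical helper mirroring the branch in chat() above.
async function buildAlibabaContent(
  v: ChatOptions["messages"][number],
  visionModel: boolean,
) {
  // Vision/omni models get DashScope-style multimodal parts.
  if (visionModel) {
    return preProcessImageContentForAlibabaDashScope(v.content);
  }
  // Text models get plain strings; earlier assistant turns have their
  // reasoning ("thinking") segments stripped first.
  return v.role === "assistant"
    ? getMessageTextContentWithoutThinking(v)
    : getMessageTextContent(v);
}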
@@ -129,7 +141,7 @@ export class QwenApi implements LLMApi {
       "X-DashScope-SSE": shouldStream ? "enable" : "disable",
     };
 
-    const chatPath = this.path(Alibaba.ChatPath);
+    const chatPath = this.path(Alibaba.ChatPath(modelConfig.model));
     const chatPayload = {
       method: "POST",
       body: JSON.stringify(requestPayload),
@@ -162,7 +174,7 @@ export class QwenApi implements LLMApi {
         const json = JSON.parse(text);
         const choices = json.output.choices as Array<{
           message: {
-            content: string | null;
+            content: string | null | MultimodalContentForAlibaba[];
             tool_calls: ChatMessageTool[];
             reasoning_content: string | null;
           };
@@ -212,7 +224,9 @@ export class QwenApi implements LLMApi {
           } else if (content && content.length > 0) {
             return {
               isThinking: false,
-              content: content,
+              content: Array.isArray(content)
+                ? content.map((item) => item.text).join(",")
+                : content,
             };
           }
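
Because the multimodal endpoint streams each chunk's content as an array of { text } parts, the handler joins them back into a single string. The same logic as a standalone sketch (flattenContent is a hypothetical name):

import { MultimodalContentForAlibaba } from "@/app/client/api";

// Multimodal chunks arrive as MultimodalContentForAlibaba[] and are
// joined (comma-separated) into one string; plain text chunks pass
// through unchanged.
function flattenContent(
  content: string | MultimodalContentForAlibaba[],
): string {
  return Array.isArray(content)
    ? content.map((item) => item.text).join(",")
    : content;
}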

app/constant.ts

+9 -1
@@ -221,7 +221,12 @@ export const ByteDance = {
 
 export const Alibaba = {
   ExampleEndpoint: ALIBABA_BASE_URL,
-  ChatPath: "v1/services/aigc/text-generation/generation",
+  ChatPath: (modelName: string) => {
+    if (modelName.includes("vl") || modelName.includes("omni")) {
+      return "v1/services/aigc/multimodal-generation/generation";
+    }
+    return `v1/services/aigc/text-generation/generation`;
+  },
 };
 
 export const Tencent = {
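
With ChatPath now keyed on the model name, endpoint selection looks like this (illustrative calls; the return values follow directly from the branch above):

import { Alibaba } from "@/app/constant";

// "vl" and "omni" models route to the multimodal endpoint:
Alibaba.ChatPath("qwen-vl-plus");    // "v1/services/aigc/multimodal-generation/generation"
Alibaba.ChatPath("qwen-omni-turbo"); // "v1/services/aigc/multimodal-generation/generation"

// Everything else keeps the text-generation endpoint:
Alibaba.ChatPath("qwen-max");        // "v1/services/aigc/text-generation/generation"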
@@ -570,6 +575,9 @@ const alibabaModes = [
   "qwen-max-0403",
   "qwen-max-0107",
   "qwen-max-longcontext",
+  "qwen-omni-turbo",
+  "qwen-vl-plus",
+  "qwen-vl-max",
 ];
 
 const tencentModels = [

app/utils/chat.ts

+21 -3
@@ -3,7 +3,7 @@ import {
   UPLOAD_URL,
   REQUEST_TIMEOUT_MS,
 } from "@/app/constant";
-import { RequestMessage } from "@/app/client/api";
+import { MultimodalContent, RequestMessage } from "@/app/client/api";
 import Locale from "@/app/locales";
 import {
   EventStreamContentType,
@@ -70,8 +70,9 @@ export function compressImage(file: Blob, maxSize: number): Promise<string> {
   });
 }
 
-export async function preProcessImageContent(
+export async function preProcessImageContentBase(
   content: RequestMessage["content"],
+  transformImageUrl: (url: string) => Promise<{ [key: string]: any }>,
 ) {
   if (typeof content === "string") {
     return content;
@@ -81,7 +82,7 @@ export async function preProcessImageContent(
     if (part?.type == "image_url" && part?.image_url?.url) {
       try {
         const url = await cacheImageToBase64Image(part?.image_url?.url);
-        result.push({ type: part.type, image_url: { url } });
+        result.push(await transformImageUrl(url));
       } catch (error) {
         console.error("Error processing image URL:", error);
       }
@@ -92,6 +93,23 @@ export async function preProcessImageContent(
   return result;
 }
 
+export async function preProcessImageContent(
+  content: RequestMessage["content"],
+) {
+  return preProcessImageContentBase(content, async (url) => ({
+    type: "image_url",
+    image_url: { url },
+  })) as Promise<MultimodalContent[] | string>;
+}
+
+export async function preProcessImageContentForAlibabaDashScope(
+  content: RequestMessage["content"],
+) {
+  return preProcessImageContentBase(content, async (url) => ({
+    image: url,
+  }));
+}
+
 const imageCaches: Record<string, string> = {};
 export function cacheImageToBase64Image(imageUrl: string) {
   if (imageUrl.includes(CACHE_URL_PREFIX)) {
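
The refactor makes preProcessImageContent a thin wrapper over preProcessImageContentBase and adds a DashScope-specific wrapper that emits flat { image } parts. A usage sketch with a hypothetical image URL (run inside an async context):

import { RequestMessage } from "@/app/client/api";
import {
  preProcessImageContent,
  preProcessImageContentForAlibabaDashScope,
} from "@/app/utils/chat";

// Hypothetical message content containing a single image part.
const content: RequestMessage["content"] = [
  { type: "image_url", image_url: { url: "https://example.com/cat.png" } },
];

// OpenAI-style providers keep `{ type: "image_url", image_url: { url } }`
// parts, with the URL converted via cacheImageToBase64Image:
const openaiParts = await preProcessImageContent(content);

// DashScope gets the flat `{ image: url }` shape from the new wrapper:
const dashscopeParts = await preProcessImageContentForAlibabaDashScope(content);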
