diff --git a/.vscode/settings.json b/.vscode/settings.json
index ce81483..18437d7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -6,6 +6,7 @@
     "CEINTL",
     "Chakroun",
     "CICD",
+    "cnbj",
     "commitlint",
     "dbaeumer",
     "deepseek",
diff --git a/bun.lockb b/bun.lockb
index e0497f3..03dbd99 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/package.json b/package.json
index 0b009dd..674e18b 100644
--- a/package.json
+++ b/package.json
@@ -20,6 +20,7 @@
     "@commitlint/cli": "^19.7.1",
     "@commitlint/config-conventional": "^19.7.1",
     "@eslint/js": "^9.19.0",
+    "@types/jsdom": "^21.1.7",
     "@types/node-schedule": "^2.1.7",
     "@types/uuid": "^10.0.0",
     "bun-types": "^1.2.2",
@@ -45,6 +46,7 @@
     "@langchain/langgraph": "^0.2.44",
     "@langchain/openai": "^0.3.17",
     "joi": "^17.13.3",
+    "jsdom": "^26.0.0",
     "langfuse-langchain": "^3.35.1",
     "node-schedule": "^2.1.1",
     "p-limit": "^6.2.0",
diff --git a/schedule/byteMonitor.ts b/schedule/byteMonitor.ts
new file mode 100644
index 0000000..b05ae98
--- /dev/null
+++ b/schedule/byteMonitor.ts
@@ -0,0 +1,264 @@
+import { S3Client, S3File } from "bun"
+import { JSDOM } from "jsdom"
+
+import initAppConfig, { APP_CONFIG, ConfigModel } from "../constant/config"
+import pbClient from "../db/pbClient"
+import { Context } from "../types"
+import { genContextManually } from "../utils/genContext"
+import llm from "../utils/llm"
+
+await initAppConfig()
+
+/** DOM nodeType value of a text node (Node.TEXT_NODE). */
+const TEXT_NODE = 3
+
+/** ID of the "env" record that stores the last processed archive ID. */
+const LATEST_ID_RECORD = "5l8a8u85p5v4aid"
+
+/**
+ * Translate a batch of texts into Simplified Chinese via the LLM.
+ * @param textList - Texts to translate
+ * @param requestId - Request ID passed to the LLM call
+ * @returns Map from each source text to its translation
+ * @throws Error if the reply is not a JSON array as long as `textList`
+ */
+const translateTextList = async (textList: string[], requestId: string) => {
+  const translatedTexts = await llm.invoke(
+    "batchTranslate",
+    {
+      inputArray: textList,
+      targetLang: "简体中文",
+    },
+    requestId,
+    1,
+    true
+  )
+
+  const translatedList: unknown = JSON.parse(translatedTexts as string)
+
+  // Translations are matched to sources by index, so a reply with the wrong
+  // shape or length cannot be applied safely.
+  if (
+    !Array.isArray(translatedList) ||
+    translatedList.length !== textList.length
+  ) {
+    throw new Error("batchTranslate returned a misaligned result")
+  }
+
+  // Object.fromEntries creates own properties, so a text such as "__proto__"
+  // is stored like any other key.
+  const translatedMap: Record<string, string> = Object.fromEntries(
+    textList.flatMap((text, index): [string, string][] => {
+      const translated: unknown = translatedList[index]
+      return typeof translated === "string" ? [[text, translated]] : []
+    })
+  )
+
+  return translatedMap
+}
+
+/**
+ * Decide whether a string is worth sending to the translator.
+ * @param str - Candidate string
+ * @returns Whether the string needs translation
+ */
+const getIsTranslatableString = (str: string): boolean => {
+  const trimmed = str.trim()
+
+  // 1. Skip empty or whitespace-only strings
+  if (trimmed === "") return false
+
+  // 2. Skip strings made up solely of \p{Emoji} code points
+  const isEmojiOnly = /^\p{Emoji}+$/u.test(trimmed)
+  if (isEmojiOnly) return false
+
+  // 3. Require at least one letter of any script (digits alone do not count)
+  return /\p{L}/u.test(trimmed)
+}
+
+/**
+ * Fetch an archived issue from bytes.dev.
+ * @param newId - Archive ID of the issue
+ * @returns Page HTML with root-relative URLs made absolute, or "" when the
+ * response is not OK
+ */
+const getNewPage = async (newId: number) => {
+  const url = `https://bytes.dev/archives/${newId}`
+  const res = await fetch(url)
+
+  if (!res.ok) {
+    return ""
+  }
+
+  const htmlContent = await res.text()
+
+  // Turn root-relative href/src values ("/x" but not "//x") into absolute URLs
+  const replacedContent = htmlContent.replace(
+    /(href|src)="\/(?!\/)/g,
+    `$1="https://bytes.dev/`
+  )
+
+  return replacedContent
+}
+
+/**
+ * Collect the direct text-node children of an element.
+ * @param ele - Element to inspect
+ * @returns Child nodes of `ele` whose nodeType is TEXT_NODE
+ */
+const getTextNodes = (ele: Element) =>
+  Array.from(ele.childNodes).filter((node) => node.nodeType === TEXT_NODE)
+
+/**
+ * Translate the text of an HTML page.
+ * @param ctx - Context object (only its logger is used)
+ * @param rawHtml - Original HTML
+ * @returns Serialized HTML with translated text and links opening in a new tab
+ */
+const translateHTML = async (ctx: Context, rawHtml: string) => {
+  const { logger } = ctx
+  const dom = new JSDOM(rawHtml)
+  const document = dom.window.document
+
+  // Remove the __NEXT_DATA__ script; querySelector returns null if it is absent
+  document.querySelector("script#__NEXT_DATA__")?.remove()
+
+  // Elements whose text is translated (extend the selector here)
+  const targetNodes = Array.from(
+    document.querySelectorAll(
+      "p:not(code *), h1:not(code *), h2:not(code *), h3:not(code *), h4:not(code *), h5:not(code *), h6:not(code *), span:not(code *), a:not(code *), li:not(code *), td, th:not(code *), caption:not(code *), button:not(code *), label:not(code *), title:not(code *)"
+    )
+  )
+
+  // Pair every direct text node with its trimmed text once, for both the
+  // extraction and the write-back step
+  const textEntries = targetNodes.flatMap((ele) =>
+    getTextNodes(ele).map((textNode) => ({
+      textNode,
+      text: textNode.textContent?.trim() ?? "",
+    }))
+  )
+
+  const uniqueTexts = [
+    ...new Set(
+      textEntries
+        .map(({ text }) => text)
+        .filter((text) => getIsTranslatableString(text))
+    ),
+  ]
+
+  logger.debug(`uniqueTexts: ${uniqueTexts}`)
+
+  // Translate in chunks so that no single request gets too long
+  const chunkSize = 20
+  const reqList: ReturnType<typeof translateTextList>[] = []
+  for (let i = 0; i < uniqueTexts.length; i += chunkSize) {
+    reqList.push(
+      translateTextList(uniqueTexts.slice(i, i + chunkSize), "translateHTML")
+    )
+  }
+
+  const translatedMaps = await Promise.all(reqList)
+
+  // A Map lookup cannot hit inherited Object.prototype members
+  const mergedMap = new Map(
+    translatedMaps.flatMap((translatedMap) => Object.entries(translatedMap))
+  )
+
+  // Write translations back, keeping each node's leading/trailing whitespace
+  // so spacing next to inline elements is not lost
+  textEntries.forEach(({ textNode, text }) => {
+    const translated = mergedMap.get(text)
+    if (!translated) return
+    const original = textNode.textContent ?? ""
+    const leadingLength = original.length - original.trimStart().length
+    const leading = original.slice(0, leadingLength)
+    const trailing = original.slice(original.trimEnd().length)
+    textNode.textContent = `${leading}${translated}${trailing}`
+  })
+
+  // Make every <a> element open its link in a new tab
+  document.querySelectorAll("a").forEach((anchor) => {
+    anchor.setAttribute("target", "_blank")
+  })
+
+  return dom.serialize()
+}
+
+/**
+ * Read the last processed archive ID.
+ * @returns The `value` field of the stored record
+ */
+const getLatestId = async () => {
+  const current = await pbClient
+    .collection("env")
+    .getOne<ConfigModel>(LATEST_ID_RECORD)
+  return current.value
+}
+
+/**
+ * Store the last processed archive ID.
+ * @param id - Archive ID to store
+ */
+const setLatestId = async (id: number) => {
+  await pbClient
+    .collection("env")
+    .update<ConfigModel>(LATEST_ID_RECORD, { value: id })
+}
+
+/**
+ * Upload the HTML to S3 as `bytes/<version>.html`.
+ * @param html - HTML content
+ * @param version - Archive ID used as the file name
+ */
+const writeHtml = async (html: string, version: number) => {
+  const client = new S3Client({
+    accessKeyId: APP_CONFIG.S3_MICHAT_AK,
+    secretAccessKey: APP_CONFIG.S3_MICHAT_SK,
+    region: "cnbj1",
+    endpoint: "https://s3-cnbj1.mi-fds.net",
+    bucket: "mi-chat-fe",
+  })
+
+  const s3file: S3File = client.file(`bytes/${version}.html`)
+  await s3file.write(html)
+}
+
+/**
+ * Look for the next bytes.dev issue; if it exists, translate it, upload it,
+ * advance the stored ID and post the link to the Lark chat.
+ */
+const byteMonitor = async () => {
+  const ctx = await genContextManually()
+  const { logger, larkService, appInfo } = ctx
+  logger.info("byteMonitor start")
+  try {
+    const latestId = await getLatestId()
+    if (!latestId) throw new Error("getLatestId empty")
+    const newId = Number(latestId) + 1
+    if (!Number.isInteger(newId)) {
+      throw new Error(`getLatestId invalid: ${latestId}`)
+    }
+    const newPage = await getNewPage(newId)
+    if (!newPage) {
+      // The job polls on a schedule, so "no page yet" is not an error
+      logger.info(`byteMonitor: page ${newId} not available`)
+      return
+    }
+    const translatedPage = await translateHTML(ctx, newPage)
+    await writeHtml(translatedPage, newId)
+    await setLatestId(newId)
+    await larkService.message.sendText2Chat(
+      appInfo.errChatId,
+      `页面链接:https://mi-chat-fe.cnbj1.mi-fds.com/mi-chat-fe/bytes/${newId}.html`,
+      "byteMonitor 更新"
+    )
+  } catch (error) {
+    const errorMessage = `byteMonitor error: ${error}`
+    logger.error(errorMessage)
+  } finally {
+    logger.info("byteMonitor finished")
+  }
+}
+
+export default byteMonitor
diff --git a/schedule/index.ts b/schedule/index.ts
index abcaa43..2aa8f20 100644
--- a/schedule/index.ts
+++ b/schedule/index.ts
@@ -2,6 +2,7 @@ import schedule from "node-schedule"
 
 import report from "../controller/groupAgent/report"
 import { loginPbClient } from "../db/pbClient"
+import byteMonitor from "./byteMonitor"
 import fmMonitor from "./fmMonitor"
 import sendZhongNotify from "./zhongNotify"
 
@@ -20,4 +21,7 @@ export const initSchedule = async () => {
 
   // 定时任务,每小时执行一次
   schedule.scheduleJob("0 * * * *", loginPbClient)
+
+  // Scheduled job: runs every hour
+  schedule.scheduleJob("0 * * * *", byteMonitor)
 }