import { S3Client, S3File } from "bun"
import { JSDOM } from "jsdom"
import initAppConfig, { APP_CONFIG, ConfigModel } from "../constant/config"
import pbClient from "../db/pbClient"
import { Context } from "../types"
import { genContextManually } from "../utils/genContext"
import llm from "../utils/llm"

// Initialise the application config at module load; writeHtml reads APP_CONFIG.
await initAppConfig()

/** PocketBase collection and record that store the id of the last processed page. */
const ENV_COLLECTION = "env"
const LATEST_ID_RECORD = "5l8a8u85p5v4aid"

/** DOM nodeType value of a text node (Node.TEXT_NODE). */
const TEXT_NODE_TYPE = 3

/**
 * Tags whose direct text children are translated. To translate more
 * elements, add the tag name here.
 */
const TRANSLATABLE_TAGS = [
  "p",
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "span",
  "a",
  "li",
  "td",
  "th",
  "caption",
  "button",
  "label",
  "title",
]

/** Selector matching every translatable tag that is not nested inside <code>. */
const TRANSLATABLE_SELECTOR = TRANSLATABLE_TAGS.map(
  (tag) => `${tag}:not(code *)`
).join(", ")

/**
 * Translates a batch of strings into Simplified Chinese with one LLM call.
 *
 * The response is parsed as JSON and read index-for-index against `textList`
 * (NOTE(review): assumes the "batchTranslate" prompt returns an array in input
 * order — confirm against the prompt definition). Entries that are missing or
 * are not non-empty strings are left out of the result, so the caller keeps
 * the original text for them.
 *
 * @param textList - Source strings to translate.
 * @param requestId - Request identifier forwarded to the LLM client.
 * @returns Null-prototype lookup table from source string to translation.
 * @throws If the response is not valid JSON or is not a JSON array.
 */
const translateTextList = async (textList: string[], requestId: string) => {
  const rawResponse = await llm.invoke(
    "batchTranslate",
    {
      inputArray: textList,
      targetLang: "简体中文",
    },
    requestId,
    1,
    true
  )

  const parsed: unknown = JSON.parse(rawResponse as string)
  if (!Array.isArray(parsed)) {
    throw new Error("batchTranslate did not return a JSON array")
  }

  // Null prototype: page text such as "__proto__" must behave as an ordinary key.
  const translatedMap: Record<string, string> = Object.create(null)
  textList.forEach((text, index) => {
    const translated: unknown = parsed[index]
    if (typeof translated === "string" && translated !== "") {
      translatedMap[text] = translated
    }
  })
  return translatedMap
}

/**
 * Decides whether a string is worth sending to the translator.
 *
 * @param str - Candidate string.
 * @returns `true` when the string contains at least one letter and is not
 *   made up solely of Emoji-property characters.
 */
const getIsTranslatableString = (str: string): boolean => {
  const trimmed = str.trim()

  // 1. Skip empty / whitespace-only strings.
  if (trimmed === "") return false

  // 2. Skip strings consisting only of Emoji-property characters. Joined
  //    sequences (containing ZWJ / variation selectors) do not match this
  //    pattern, but they contain no letters and are rejected by step 3.
  if (/^\p{Emoji}+$/u.test(trimmed)) return false

  // 3. Require at least one letter of any script (digits alone are not enough).
  return /\p{L}/u.test(trimmed)
}

/**
 * Downloads one archive page from bytes.dev.
 *
 * @param newId - Archive id used to build the URL.
 * @returns The page HTML with root-relative `href`/`src` values rewritten to
 *   absolute bytes.dev URLs, or `""` when the response status is not OK.
 */
const getNewPage = async (newId: number) => {
  const res = await fetch(`https://bytes.dev/archives/${newId}`)
  if (!res.ok) {
    return ""
  }
  const htmlContent = await res.text()
  // Rewrite href="/x" and src="/x"; the lookahead leaves protocol-relative "//" alone.
  return htmlContent.replace(
    /(href|src)="\/(?!\/)/g,
    `$1="https://bytes.dev/`
  )
}

/** Returns the text nodes that are direct children of `ele`. */
const getDirectTextNodes = (ele: Element): ChildNode[] =>
  Array.from(ele.childNodes).filter((node) => node.nodeType === TEXT_NODE_TYPE)

/**
 * Translates the visible text of an HTML document.
 *
 * Steps: drop the `__NEXT_DATA__` script if present, collect the trimmed text
 * of every direct text node under the translatable tags, translate the unique
 * strings in chunks, write the translations back, and make every link open in
 * a new tab.
 *
 * @param ctx - Context object; only `logger` is used here.
 * @param rawHtml - Original HTML.
 * @returns The serialized, translated HTML.
 */
const translateHTML = async (ctx: Context, rawHtml: string) => {
  const { logger } = ctx
  const dom = new JSDOM(rawHtml)
  const document = dom.window.document

  // Remove the __NEXT_DATA__ script; querySelector returns null when it is absent.
  document.querySelector("script#__NEXT_DATA__")?.remove()

  const targetNodes = Array.from(
    document.querySelectorAll(TRANSLATABLE_SELECTOR)
  )

  // Collect the text that needs translating.
  const needTranslateText: string[] = []
  targetNodes.forEach((ele) => {
    getDirectTextNodes(ele).forEach((textNode) => {
      const text = textNode.textContent?.trim()
      if (text && getIsTranslatableString(text)) {
        needTranslateText.push(text)
      }
    })
  })
  const uniqueTexts = [...new Set(needTranslateText)]
  logger.debug(`uniqueTexts: ${uniqueTexts}`)

  // Split into several requests so that no single prompt gets too long.
  const chunkSize = 20
  const chunks: string[][] = []
  for (let i = 0; i < uniqueTexts.length; i += chunkSize) {
    chunks.push(uniqueTexts.slice(i, i + chunkSize))
  }
  const translatedMaps = await Promise.all(
    chunks.map((chunk) => translateTextList(chunk, "translateHTML"))
  )

  // A Map never resolves inherited keys, so text like "constructor" or
  // "__proto__" cannot pick up a value from Object.prototype.
  const translations = new Map<string, string>()
  for (const translatedMap of translatedMaps) {
    for (const [source, translated] of Object.entries(translatedMap)) {
      translations.set(source, translated)
    }
  }

  // Write the translations back into the document.
  targetNodes.forEach((ele) => {
    getDirectTextNodes(ele).forEach((textNode) => {
      const text = textNode.textContent?.trim()
      if (!text) return
      const translated = translations.get(text)
      if (translated) {
        textNode.textContent = translated
      }
    })
  })

  // Make every <a> open its link in a new tab.
  document.querySelectorAll("a").forEach((anchor) => {
    anchor.setAttribute("target", "_blank")
  })

  return dom.serialize()
}

/**
 * Reads the id of the most recently processed page.
 *
 * @returns The `value` field of the stored record.
 */
const getLatestId = async () => {
  const current = await pbClient
    .collection(ENV_COLLECTION)
    .getOne(LATEST_ID_RECORD)
  return current.value
}

/**
 * Stores the id of the most recently processed page.
 *
 * @param id - Page id to persist.
 */
const setLatestId = async (id: number) => {
  await pbClient
    .collection(ENV_COLLECTION)
    .update(LATEST_ID_RECORD, { value: id })
}

/**
 * Uploads HTML to the object `bytes/<version>.html` in the "mi-chat-fe" bucket.
 *
 * @param html - HTML content to write.
 * @param version - Page id used as the object file name.
 */
const writeHtml = async (html: string, version: number) => {
  const client = new S3Client({
    accessKeyId: APP_CONFIG.S3_MICHAT_AK,
    secretAccessKey: APP_CONFIG.S3_MICHAT_SK,
    region: "cnbj1",
    endpoint: "https://s3-cnbj1.mi-fds.net",
    bucket: "mi-chat-fe",
  })
  const s3file: S3File = client.file(`bytes/${version}.html`)
  await s3file.write(html)
}

/**
 * Fetches the page after the last processed one, translates it, uploads it,
 * persists the new id and sends the resulting link to the Lark chat
 * `appInfo.errChatId`.
 *
 * Any failure is logged and not rethrown; the stored id is only advanced
 * after the upload succeeded.
 */
const byteMonitor = async () => {
  const ctx = await genContextManually()
  const { logger, larkService, appInfo } = ctx
  logger.info("byteMonitor start")
  try {
    const latestId = await getLatestId()
    if (!latestId) throw new Error("getLatestId empty")

    const newId = Number(latestId) + 1
    // Fail fast instead of requesting /archives/NaN for a corrupt stored value.
    if (!Number.isFinite(newId)) {
      throw new Error(`getLatestId invalid: ${latestId}`)
    }

    const newPage = await getNewPage(newId)
    if (!newPage) throw new Error("getNewPage empty")

    const translatedPage = await translateHTML(ctx, newPage)
    await writeHtml(translatedPage, newId)
    await setLatestId(newId)
    await larkService.message.sendText2Chat(
      appInfo.errChatId,
      `页面链接:https://mi-chat-fe.cnbj1.mi-fds.com/mi-chat-fe/bytes/${newId}.html`,
      "byteMonitor 更新"
    )
  } catch (error) {
    const errorMessage = `byteMonitor error: ${error}`
    logger.error(errorMessage)
  } finally {
    logger.info("byteMonitor finished")
  }
}

export default byteMonitor