238 lines
6.3 KiB
TypeScript
238 lines
6.3 KiB
TypeScript
import { S3Client, S3File } from "bun"
|
||
import { JSDOM } from "jsdom"
|
||
|
||
import initAppConfig, { APP_CONFIG, ConfigModel } from "../constant/config"
|
||
import pbClient from "../db/pbClient"
|
||
import { Context } from "../types"
|
||
import { genContextManually } from "../utils/genContext"
|
||
import llm from "../utils/llm"
|
||
|
||
// Run the config initializer once at module load (top-level await), before any
// function below executes. Presumably this populates APP_CONFIG, which writeHtml
// reads for its S3 credentials — confirm in ../constant/config.
await initAppConfig()
|
||
|
||
/**
 * Translates a batch of strings into Simplified Chinese with one
 * `llm.invoke("batchTranslate", …)` call.
 *
 * @param textList - Source strings to translate.
 * @param requestId - Request identifier forwarded to `llm.invoke`.
 * @returns A map from each source string to its translation. A source string
 *   is left out of the map when the response has no string entry at its index.
 * @throws If the response is not valid JSON or does not decode to an array.
 */
const translateTextList = async (textList: string[], requestId: string) => {
  // The trailing `1, true` arguments are passed through unchanged; their
  // meaning is defined by llm.invoke, which is not visible in this file.
  const rawResponse = await llm.invoke(
    "batchTranslate",
    {
      inputArray: textList,
      targetLang: "简体中文",
    },
    requestId,
    1,
    true
  )

  // Model output is untrusted: validate the shape instead of asserting it.
  // Indexing a non-array (e.g. a JSON string) would silently yield garbage.
  const parsed: unknown = JSON.parse(rawResponse as string)
  if (!Array.isArray(parsed)) {
    throw new Error("batchTranslate returned a non-array payload")
  }

  const translatedMap: Record<string, string> = {}
  textList.forEach((text, index) => {
    const translated: unknown = parsed[index]
    // A short or malformed array must not store `undefined` (or a non-string)
    // in a map that is typed as string -> string.
    if (typeof translated === "string") {
      translatedMap[text] = translated
    }
  })

  return translatedMap
}
|
||
|
||
/**
 * Decides whether a string is worth sending to the translator.
 *
 * A string qualifies only when, after trimming, it is non-empty, does not
 * consist solely of characters with the Unicode `Emoji` property, and contains
 * at least one Unicode letter (`\p{L}`, any script).
 *
 * @param str - Candidate string.
 * @returns `true` when the string should be translated, `false` otherwise.
 */
const getIsTranslatableString = (str: string): boolean => {
  const candidate = str.trim()

  // Nothing left after trimming: empty or whitespace-only input.
  if (candidate.length === 0) {
    return false
  }

  // Every character carries the Emoji property: nothing to translate.
  const emojiOnlyPattern = /^\p{Emoji}+$/u
  if (emojiOnlyPattern.test(candidate)) {
    return false
  }

  // Digits and punctuation alone do not count; at least one letter is required.
  const letterPattern = /\p{L}/u
  return letterPattern.test(candidate)
}
|
||
|
||
/**
 * Downloads one archive page from bytes.dev and makes its root-relative links
 * absolute.
 *
 * @param newId - Archive id; used as the last path segment of the request URL.
 * @returns The page HTML in which every double-quoted `href` / `src` value that
 *   starts with a single `/` is prefixed with `https://bytes.dev`
 *   (protocol-relative `//…` values are left alone), or an empty string when
 *   the response status is not OK.
 */
const getNewPage = async (newId: number) => {
  const response = await fetch(`https://bytes.dev/archives/${newId}`)

  // A non-2xx answer is reported to the caller as an empty page.
  if (!response.ok) return ""

  // Matches `href="/` or `src="/` but not `href="//` (negative lookahead).
  const rootRelativeAttr = /(href|src)="\/(?!\/)/g
  const html = await response.text()

  return html.replace(rootRelativeAttr, `$1="https://bytes.dev/`)
}
|
||
|
||
/**
 * Translates the text of an HTML page and returns the rewritten markup.
 *
 * Steps, in order:
 * 1. Parse `rawHtml` with JSDOM and drop the `script#__NEXT_DATA__` element
 *    when it exists.
 * 2. Collect the direct text-node children of every element matched by the
 *    selector below.
 * 3. De-duplicate the strings accepted by `getIsTranslatableString` and
 *    translate them through `translateTextList` in chunks of 20, all chunks
 *    in parallel.
 * 4. Write each translation back into its text node, keeping the node's
 *    original leading and trailing whitespace.
 * 5. Set `target="_blank"` on every `<a>` element.
 *
 * @param ctx - Context object; only `ctx.logger` is used here.
 * @param rawHtml - HTML source of the page to translate.
 * @returns The serialized document after translation.
 * @throws Whatever `translateTextList` rejects with; because the chunks are
 *   awaited with `Promise.all`, one failed chunk fails the whole call.
 */
const translateHTML = async (ctx: Context, rawHtml: string) => {
  const { logger } = ctx
  const dom = new JSDOM(rawHtml)
  const document = dom.window.document

  // Remove the __NEXT_DATA__ script. querySelector returns null when the page
  // has no such element, so the call is guarded instead of asserted.
  document.querySelector("script#__NEXT_DATA__")?.remove()

  // Elements whose text is translated (extend the selector here to add tags).
  // NOTE(review): `td` is the only entry without `:not(code *)` — confirm
  // whether that is intentional.
  const targetNodes = Array.from(
    document.querySelectorAll(
      "p:not(code *), h1:not(code *), h2:not(code *), h3:not(code *), h4:not(code *), h5:not(code *), h6:not(code *), span:not(code *), a:not(code *), li:not(code *), td, th:not(code *), caption:not(code *), button:not(code *), label:not(code *), title:not(code *)"
    )
  )

  // Collect every non-blank direct text-node child once, together with its
  // trimmed text, so extraction and write-back work on the same list.
  const textEntries = targetNodes.flatMap((ele) =>
    Array.from(ele.childNodes)
      .filter((node) => node.nodeType === 3) // 3 === Node.TEXT_NODE
      .map((node) => ({ node, text: node.textContent?.trim() ?? "" }))
      .filter(({ text }) => text !== "")
  )

  const uniqueTexts = [
    ...new Set(
      textEntries
        .map(({ text }) => text)
        .filter((text) => getIsTranslatableString(text))
    ),
  ]

  logger.debug(`uniqueTexts: ${uniqueTexts}`)

  // Translate in several requests so that no single request grows too long.
  const chunkSize = 20
  const requests: ReturnType<typeof translateTextList>[] = []
  for (let start = 0; start < uniqueTexts.length; start += chunkSize) {
    requests.push(
      translateTextList(
        uniqueTexts.slice(start, start + chunkSize),
        "translateHTML"
      )
    )
  }
  const translatedMaps = await Promise.all(requests)

  // Merge into a Map rather than a plain object: a lookup such as
  // obj["__proto__"] or obj["constructor"] on a plain object can return an
  // inherited member, which would then be written into the page.
  const translations = new Map<string, string>()
  for (const translatedMap of translatedMaps) {
    for (const [source, translated] of Object.entries(translatedMap)) {
      // Values originate from model output; keep only non-empty strings.
      if (typeof translated === "string" && translated !== "") {
        translations.set(source, translated)
      }
    }
  }

  // Write the translations back into the page.
  for (const { node, text } of textEntries) {
    const translated = translations.get(text)
    if (!translated) continue

    // Keep the whitespace that surrounded the original text; dropping it
    // would glue the translation to neighbouring inline elements.
    const original = node.textContent ?? ""
    const leading = original.slice(
      0,
      original.length - original.trimStart().length
    )
    const trailing = original.slice(original.trimEnd().length)
    node.textContent = `${leading}${translated}${trailing}`
  }

  // Make every <a> open its link in a new page.
  document.querySelectorAll("a").forEach((anchor) => {
    anchor.setAttribute("target", "_blank")
  })

  return dom.serialize()
}
|
||
|
||
/**
 * Reads the stored page id.
 *
 * Fetches record `5l8a8u85p5v4aid` from the `env` collection through
 * `pbClient` and hands back its `value` field; `byteMonitor` treats that value
 * as the id of the last processed page.
 *
 * @returns The `value` field of the record.
 */
const getLatestId = async () => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  const { value } = await envCollection.getOne("5l8a8u85p5v4aid")
  return value
}
|
||
|
||
/**
 * Stores a new page id.
 *
 * Updates the `value` field of record `5l8a8u85p5v4aid` in the `env`
 * collection through `pbClient`.
 *
 * @param id - Page id to store.
 */
const setLatestId = async (id: number) => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  await envCollection.update("5l8a8u85p5v4aid", { value: id })
}
|
||
|
||
/**
 * Uploads an HTML document to S3 storage.
 *
 * The object is written to the key `bytes/<version>.html` in the
 * `mi-chat-fe` bucket, using the access key pair taken from `APP_CONFIG`.
 *
 * @param html - HTML content to upload.
 * @param version - Number used to build the object key.
 */
const writeHtml = async (html: string, version: number) => {
  const objectKey = `bytes/${version}.html`

  const storage = new S3Client({
    accessKeyId: APP_CONFIG.S3_MICHAT_AK,
    secretAccessKey: APP_CONFIG.S3_MICHAT_SK,
    region: "cnbj1",
    endpoint: "https://s3-cnbj1.mi-fds.net",
    bucket: "mi-chat-fe",
  })

  const target: S3File = storage.file(objectKey)
  await target.write(html)
}
|
||
|
||
/**
 * Looks for the page that follows the last stored id, translates it and
 * publishes the result.
 *
 * Flow: read the stored id, fetch the page for `id + 1`, translate it, upload
 * the translated HTML, store the new id, then send the page link to the chat
 * identified by `appInfo.errChatId`. Any failure along the way is logged and
 * not rethrown; the start and finish log lines are always written.
 */
const byteMonitor = async () => {
  const ctx = await genContextManually()
  const { logger, larkService, appInfo } = ctx

  logger.info("byteMonitor start")

  try {
    const storedId = await getLatestId()
    if (!storedId) {
      throw new Error("getLatestId empty")
    }

    const nextId = Number(storedId) + 1
    const rawPage = await getNewPage(nextId)
    if (!rawPage) {
      throw new Error("getNewPage empty")
    }

    const localizedPage = await translateHTML(ctx, rawPage)
    await writeHtml(localizedPage, nextId)

    // Store the new id only after the upload succeeded.
    await setLatestId(nextId)

    const announcement = `页面链接:https://mi-chat-fe.cnbj1.mi-fds.com/mi-chat-fe/bytes/${nextId}.html`
    await larkService.message.sendText2Chat(
      appInfo.errChatId,
      announcement,
      "byteMonitor 更新"
    )
  } catch (error) {
    logger.error(`byteMonitor error: ${error}`)
  } finally {
    logger.info("byteMonitor finished")
  }
}
|
||
|
||
export default byteMonitor
|