egg_server/schedule/byteMonitor.ts

238 lines
6.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { S3Client, S3File } from "bun"
import { JSDOM } from "jsdom"
import initAppConfig, { APP_CONFIG, ConfigModel } from "../constant/config"
import pbClient from "../db/pbClient"
import { Context } from "../types"
import { genContextManually } from "../utils/genContext"
import llm from "../utils/llm"
// Top-level await: the config initializer runs at module load and must settle
// before any function defined below can execute.
// NOTE(review): presumably this populates APP_CONFIG (read by writeHtml) —
// confirm in ../constant/config.
await initAppConfig()
/**
 * Translates a batch of strings into Simplified Chinese via the LLM
 * `batchTranslate` prompt.
 *
 * The reply is parsed as JSON and must be an array; element `i` is taken as
 * the translation of `textList[i]`. Positions where the reply has no string
 * (array too short, `null`, nested value, ...) are left out of the result so
 * the returned record never contains non-string values.
 *
 * @param textList - Source strings to translate.
 * @param requestId - Request identifier forwarded to `llm.invoke`.
 * @returns Map from each source string to its translated string.
 * @throws SyntaxError when the reply is not valid JSON.
 * @throws Error when the reply is valid JSON but not an array.
 */
const translateTextList = async (textList: string[], requestId: string) => {
  const rawReply = await llm.invoke(
    "batchTranslate",
    {
      inputArray: textList,
      targetLang: "简体中文",
    },
    requestId,
    1,
    true
  )
  // Validate the shape instead of trusting a cast: the reply comes from an LLM.
  const parsed: unknown = JSON.parse(rawReply as string)
  if (!Array.isArray(parsed)) {
    throw new Error("batchTranslate reply is not a JSON array")
  }
  const translatedMap: Record<string, string> = {}
  textList.forEach((text, index) => {
    const translated: unknown = parsed[index]
    // Skip missing / malformed entries rather than storing `undefined`.
    if (typeof translated === "string") {
      translatedMap[text] = translated
    }
  })
  return translatedMap
}
/**
 * Decides whether a string is worth sending to the translator.
 *
 * A string is rejected when, after trimming, it is empty, consists solely of
 * characters carrying the Unicode `Emoji` property, or contains no letter
 * (`\p{L}`) at all — e.g. plain numbers or punctuation.
 *
 * @param str - Candidate string.
 * @returns `true` when the string contains translatable text.
 */
const getIsTranslatableString = (str: string): boolean => {
  const candidate = str.trim()
  // Nothing left once surrounding whitespace is removed.
  if (candidate.length === 0) {
    return false
  }
  // Made up exclusively of Emoji-property characters.
  if (/^\p{Emoji}+$/u.test(candidate)) {
    return false
  }
  // Digits and symbols may be present, but at least one letter is required.
  return /\p{L}/u.test(candidate)
}
/**
 * Downloads one archive page from bytes.dev.
 *
 * Double-quoted `href`/`src` values that start with a single `/` are rewritten
 * to absolute `https://bytes.dev/` URLs; protocol-relative values (`//host/...`)
 * are left alone.
 *
 * @param newId - Archive page ID, used as the last path segment of the URL.
 * @returns The page HTML with rewritten links, or `""` when the response
 * status is not OK.
 */
const getNewPage = async (newId: number) => {
  const response = await fetch(`https://bytes.dev/archives/${newId}`)
  if (response.ok) {
    const body = await response.text()
    // Turn root-relative paths into absolute ones.
    return body.replace(/(href|src)="\/(?!\/)/g, `$1="https://bytes.dev/`)
  }
  return ""
}
/**
 * Translates the human-readable text of an HTML page and returns the
 * serialized document.
 *
 * Steps: parse with JSDOM, drop the `__NEXT_DATA__` script if present, collect
 * the direct text nodes of the target elements, translate the unique
 * translatable strings in chunks, write the translations back, and make every
 * `<a>` open in a new tab.
 *
 * A translated text node is replaced wholesale with the translation that was
 * looked up by its trimmed text. Text nodes without a translation are left
 * unchanged.
 *
 * @param ctx - Context object; only `logger` is used here.
 * @param rawHtml - Original HTML source.
 * @returns The translated HTML as a string.
 */
const translateHTML = async (ctx: Context, rawHtml: string) => {
  const { logger } = ctx
  const dom = new JSDOM(rawHtml)
  const document = dom.window.document

  // Remove the __NEXT_DATA__ script. querySelector returns null when the page
  // has no such script, so guard instead of dereferencing unconditionally.
  document.querySelector("script#__NEXT_DATA__")?.remove()

  // Elements whose direct text gets translated — add new tags here.
  // Every entry is excluded when nested inside <code>.
  const translatableTags = [
    "p",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "span",
    "a",
    "li",
    "td",
    "th",
    "caption",
    "button",
    "label",
    "title",
  ]
  const selector = translatableTags
    .map((tag) => `${tag}:not(code *)`)
    .join(", ")

  // Collect the direct text-node children once; both the extraction and the
  // write-back pass operate on this same list.
  const TEXT_NODE = 3
  const textNodes = Array.from(document.querySelectorAll(selector)).flatMap(
    (ele) =>
      Array.from(ele.childNodes).filter((node) => node.nodeType === TEXT_NODE)
  )

  // Unique, trimmed strings that are worth translating.
  const uniqueTexts = [
    ...new Set(
      textNodes
        .map((node) => node.textContent?.trim() ?? "")
        .filter((text) => getIsTranslatableString(text))
    ),
  ]
  logger.debug(`uniqueTexts: ${uniqueTexts}`)

  // Translate in chunks so a single request does not grow too long.
  const chunkSize = 20
  const chunks: string[][] = []
  for (let i = 0; i < uniqueTexts.length; i += chunkSize) {
    chunks.push(uniqueTexts.slice(i, i + chunkSize))
  }
  const translatedMaps = await Promise.all(
    chunks.map((chunk) => translateTextList(chunk, "translateHTML"))
  )

  // Merge into a Map so lookups never resolve inherited Object.prototype
  // members (e.g. a text node that reads "constructor").
  const translations = new Map<string, string>()
  for (const translatedMap of translatedMaps) {
    for (const [source, translated] of Object.entries(translatedMap)) {
      translations.set(source, translated)
    }
  }

  // Write the translations back into the document.
  for (const node of textNodes) {
    const text = node.textContent?.trim()
    if (!text) continue
    const translated = translations.get(text)
    if (translated) {
      node.textContent = translated
    }
  }

  // Make every <a> open its link in a new page.
  document.querySelectorAll("a").forEach((anchor) => {
    anchor.setAttribute("target", "_blank")
  })
  return dom.serialize()
}
/**
 * Reads the stored latest page ID.
 *
 * @returns The `value` field of record `5l8a8u85p5v4aid` in the `env`
 * collection.
 */
const getLatestId = async () => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  const { value } = await envCollection.getOne("5l8a8u85p5v4aid")
  return value
}
/**
 * Stores the latest page ID.
 *
 * Updates the `value` field of record `5l8a8u85p5v4aid` in the `env`
 * collection.
 *
 * @param id - Page ID to store.
 */
const setLatestId = async (id: number) => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  const patch = { value: id }
  await envCollection.update("5l8a8u85p5v4aid", patch)
}
/**
 * Uploads an HTML document to the `mi-chat-fe` S3 bucket under the key
 * `bytes/<version>.html`.
 *
 * @param html - HTML content to write.
 * @param version - Number used as the object's file name.
 */
const writeHtml = async (html: string, version: number) => {
  const objectKey = `bytes/${version}.html`
  // Credentials come from the application config; the endpoint and bucket are fixed.
  const target: S3File = new S3Client({
    accessKeyId: APP_CONFIG.S3_MICHAT_AK,
    secretAccessKey: APP_CONFIG.S3_MICHAT_SK,
    region: "cnbj1",
    endpoint: "https://s3-cnbj1.mi-fds.net",
    bucket: "mi-chat-fe",
  }).file(objectKey)
  await target.write(html)
}
/**
 * Checks for the next archive page and, when it exists, translates and
 * publishes it.
 *
 * Flow: read the stored latest ID, fetch page `latest + 1`, translate it,
 * upload the result, store the new ID, then send a chat message containing
 * the uploaded page's link. Any error along the way is logged and swallowed;
 * a "finished" log line is always written.
 */
const byteMonitor = async () => {
  const ctx = await genContextManually()
  const { logger, larkService, appInfo } = ctx
  logger.info("byteMonitor start")
  try {
    const storedId = await getLatestId()
    if (!storedId) {
      throw new Error("getLatestId empty")
    }

    const nextId = Number(storedId) + 1
    const rawPage = await getNewPage(nextId)
    // An empty string means the fetch did not return an OK response.
    if (!rawPage) {
      throw new Error("getNewPage empty")
    }

    const localizedPage = await translateHTML(ctx, rawPage)
    await writeHtml(localizedPage, nextId)
    // Only advance the stored ID after the upload has succeeded.
    await setLatestId(nextId)

    const notice = `页面链接https://mi-chat-fe.cnbj1.mi-fds.com/mi-chat-fe/bytes/${nextId}.html`
    await larkService.message.sendText2Chat(
      appInfo.errChatId,
      notice,
      "byteMonitor 更新"
    )
  } catch (err) {
    logger.error(`byteMonitor error: ${err}`)
  } finally {
    logger.info("byteMonitor finished")
  }
}
export default byteMonitor