feat: 添加byteMonitor以监控并翻译最新页面内容,更新相关依赖

This commit is contained in:
zhaoyingbo 2025-02-13 08:10:57 +00:00
parent 26e51e132b
commit 32396aecf1
5 changed files with 244 additions and 0 deletions

View File

@ -6,6 +6,7 @@
"CEINTL",
"Chakroun",
"CICD",
"cnbj",
"commitlint",
"dbaeumer",
"deepseek",

BIN
bun.lockb

Binary file not shown.

View File

@ -20,6 +20,7 @@
"@commitlint/cli": "^19.7.1",
"@commitlint/config-conventional": "^19.7.1",
"@eslint/js": "^9.19.0",
"@types/jsdom": "^21.1.7",
"@types/node-schedule": "^2.1.7",
"@types/uuid": "^10.0.0",
"bun-types": "^1.2.2",
@ -45,6 +46,7 @@
"@langchain/langgraph": "^0.2.44",
"@langchain/openai": "^0.3.17",
"joi": "^17.13.3",
"jsdom": "^26.0.0",
"langfuse-langchain": "^3.35.1",
"node-schedule": "^2.1.1",
"p-limit": "^6.2.0",

237
schedule/byteMonitor.ts Normal file
View File

@ -0,0 +1,237 @@
import { S3Client, S3File } from "bun"
import { JSDOM } from "jsdom"
import initAppConfig, { APP_CONFIG, ConfigModel } from "../constant/config"
import pbClient from "../db/pbClient"
import { Context } from "../types"
import { genContextManually } from "../utils/genContext"
import llm from "../utils/llm"
// Initialise the application config at module load, before any function below
// runs. NOTE(review): presumably this populates APP_CONFIG (read by writeHtml)
// — confirm in ../constant/config.
await initAppConfig()
/**
 * Translates a batch of strings to Simplified Chinese through the
 * "batchTranslate" LLM prompt and pairs every source string with its result.
 *
 * The model reply is parsed as JSON and matched to the inputs by position, so
 * it must be an array with exactly one entry per input string.
 *
 * @param textList - Source strings to translate.
 * @param requestId - Identifier forwarded to `llm.invoke`.
 * @returns A map from source string to translated string. Inputs whose reply
 *   entry is not a string are left out of the map.
 * @throws Error when the reply is not valid JSON, is not an array, or does not
 *   contain exactly `textList.length` entries.
 */
const translateTextList = async (textList: string[], requestId: string) => {
  const translatedMap: Record<string, string> = {}
  // Nothing to translate: skip the model call entirely.
  if (textList.length === 0) return translatedMap

  const translatedTexts = await llm.invoke(
    "batchTranslate",
    {
      inputArray: textList,
      targetLang: "简体中文",
    },
    requestId,
    // NOTE(review): the meaning of the two trailing positional arguments is
    // defined by llm.invoke — confirm there.
    1,
    true
  )

  const parsed: unknown = JSON.parse(translatedTexts as string)
  if (!Array.isArray(parsed)) {
    throw new Error("batchTranslate: expected a JSON array of translations")
  }
  // Results are matched by index; a different length means the pairing would
  // be misaligned, so fail loudly instead of producing wrong translations.
  if (parsed.length !== textList.length) {
    throw new Error(
      `batchTranslate: expected ${textList.length} translations, got ${parsed.length}`
    )
  }

  textList.forEach((text, index) => {
    const translated: unknown = parsed[index]
    // Skip malformed entries so the caller keeps the original text for them.
    if (typeof translated === "string") {
      translatedMap[text] = translated
    }
  })
  return translatedMap
}
/**
 * Decides whether a piece of text is worth sending to the translator.
 *
 * A string qualifies only when, after trimming, it is non-empty, is not made
 * up solely of `\p{Emoji}` characters, and contains at least one Unicode
 * letter (`\p{L}`). Digits may be present but are not sufficient on their own.
 *
 * @param str - Candidate text.
 * @returns `true` when the text should be translated.
 */
const getIsTranslatableString = (str: string): boolean => {
  const candidate = str.trim()
  const isBlank = candidate.length === 0
  const isOnlyEmoji = /^\p{Emoji}+$/u.test(candidate)
  const containsLetter = /\p{L}/u.test(candidate)
  return !isBlank && !isOnlyEmoji && containsLetter
}
/**
 * Downloads one archive page from bytes.dev.
 *
 * Root-relative `href="/..."` and `src="/..."` attributes are rewritten to
 * absolute `https://bytes.dev/...` URLs; protocol-relative `//` URLs are left
 * untouched.
 *
 * @param newId - Archive id used in the `/archives/{id}` path.
 * @returns The rewritten HTML, or an empty string when the response is not OK.
 */
const getNewPage = async (newId: number) => {
  const response = await fetch(`https://bytes.dev/archives/${newId}`)
  if (!response.ok) return ""

  const pageHtml = await response.text()
  // The negative lookahead keeps protocol-relative ("//host/...") URLs intact.
  return pageHtml.replace(/(href|src)="\/(?!\/)/g, `$1="https://bytes.dev/`)
}
/**
 * Translates the human-readable text of an HTML document and returns the
 * serialized result.
 *
 * Steps: parse with JSDOM, drop the `__NEXT_DATA__` script if present, collect
 * the direct text nodes of the targeted elements (anything inside `<code>` is
 * excluded), translate the unique strings in chunks, write the translations
 * back while keeping each node's surrounding whitespace, and make every link
 * open in a new tab.
 *
 * @param ctx - Request context; only its `logger` is used here.
 * @param rawHtml - HTML source of the page to translate.
 * @returns The translated document as an HTML string.
 */
const translateHTML = async (ctx: Context, rawHtml: string) => {
  const { logger } = ctx
  const dom = new JSDOM(rawHtml)
  const document = dom.window.document

  // Remove the __NEXT_DATA__ script. Optional chaining: a page without it
  // must not abort the whole translation with a TypeError.
  document.querySelector("script#__NEXT_DATA__")?.remove()

  // Elements whose direct text should be translated; add new tags here.
  // Every tag gets the same ":not(code *)" guard so code samples stay as-is.
  const translatableTags = [
    "p",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "span",
    "a",
    "li",
    "td",
    "th",
    "caption",
    "button",
    "label",
    "title",
  ]
  const selector = translatableTags
    .map((tag) => `${tag}:not(code *)`)
    .join(", ")

  // nodeType 3 === Node.TEXT_NODE (the DOM global is not assumed to exist here).
  const TEXT_NODE = 3
  // Collect the text nodes once; they are reused for extraction and write-back.
  const textNodes = Array.from(document.querySelectorAll(selector)).flatMap(
    (ele) =>
      Array.from(ele.childNodes).filter((node) => node.nodeType === TEXT_NODE)
  )

  // Extract the text that needs translating.
  const needTranslateText: string[] = []
  for (const textNode of textNodes) {
    const text = textNode.textContent?.trim()
    if (text && getIsTranslatableString(text)) {
      needTranslateText.push(text)
    }
  }
  const uniqueTexts = [...new Set(needTranslateText)]
  logger.debug(`uniqueTexts: ${uniqueTexts}`)

  // Translate in several requests so a single prompt does not get too long.
  const chunkSize = 20
  const chunks: string[][] = []
  for (let i = 0; i < uniqueTexts.length; i += chunkSize) {
    chunks.push(uniqueTexts.slice(i, i + chunkSize))
  }
  const translatedMaps = await Promise.all(
    chunks.map((chunk) => translateTextList(chunk, "translateHTML"))
  )

  // A Map avoids accidental hits on inherited object keys when looking up
  // arbitrary page text.
  const translations = new Map<string, string>()
  for (const translatedMap of translatedMaps) {
    for (const [source, translated] of Object.entries(translatedMap)) {
      if (typeof translated === "string" && translated) {
        translations.set(source, translated)
      }
    }
  }

  // Write the translations back into the page.
  for (const textNode of textNodes) {
    const original = textNode.textContent ?? ""
    const text = original.trim()
    if (!text) continue
    const translated = translations.get(text)
    if (!translated) continue
    // Keep the original leading/trailing whitespace so the text does not get
    // glued to neighbouring inline elements.
    const leading = original.slice(
      0,
      original.length - original.trimStart().length
    )
    const trailing = original.slice(original.trimEnd().length)
    textNode.textContent = `${leading}${translated}${trailing}`
  }

  // Make every <a> open its link in a new tab.
  document.querySelectorAll("a").forEach((anchor) => {
    anchor.setAttribute("target", "_blank")
  })

  return dom.serialize()
}
/**
 * Reads the stored id from record `5l8a8u85p5v4aid` of the `env` collection.
 *
 * @returns The record's `value` field.
 */
const getLatestId = async () => {
  const { value } = await pbClient
    .collection<ConfigModel>("env")
    .getOne("5l8a8u85p5v4aid")
  return value
}
/**
 * Stores an id in the `value` field of record `5l8a8u85p5v4aid` of the `env`
 * collection.
 *
 * @param id - Id to persist.
 */
const setLatestId = async (id: number) => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  await envCollection.update("5l8a8u85p5v4aid", { value: id })
}
/**
 * Uploads an HTML document to the `mi-chat-fe` bucket as
 * `bytes/{version}.html`.
 *
 * @param html - HTML content to store.
 * @param version - Number used as the object's file name.
 */
const writeHtml = async (html: string, version: number) => {
  const client = new S3Client({
    accessKeyId: APP_CONFIG.S3_MICHAT_AK,
    secretAccessKey: APP_CONFIG.S3_MICHAT_SK,
    region: "cnbj1",
    endpoint: "https://s3-cnbj1.mi-fds.net",
    bucket: "mi-chat-fe",
  })
  const s3file: S3File = client.file(`bytes/${version}.html`)
  // Set the content type explicitly: the object is shared as a browser link,
  // so it must be served as HTML rather than rely on a default type.
  await s3file.write(html, { type: "text/html;charset=utf-8" })
}
/**
 * Scheduled job: takes the stored id, fetches the page for the next id,
 * translates it, uploads the result, persists the new id and posts the link
 * to the chat identified by `appInfo.errChatId`.
 *
 * Any failure is logged via `logger.error` and not rethrown, so the job never
 * rejects after the context has been created.
 */
const byteMonitor = async () => {
  const ctx = await genContextManually()
  const { logger, larkService, appInfo } = ctx
  logger.info("byteMonitor start")
  try {
    const latestId = await getLatestId()
    if (!latestId) throw new Error("getLatestId empty")

    const newId = Number(latestId) + 1
    // A non-numeric stored value would become NaN and trigger a pointless
    // request to /archives/NaN; fail early with a precise message instead.
    if (!Number.isSafeInteger(newId)) {
      throw new Error(`getLatestId invalid: ${latestId}`)
    }

    const newPage = await getNewPage(newId)
    if (!newPage) throw new Error("getNewPage empty")

    const translatedPage = await translateHTML(ctx, newPage)
    await writeHtml(translatedPage, newId)
    // Advance the stored id only after the upload succeeded, so a failed run
    // is retried on the next schedule tick.
    await setLatestId(newId)

    await larkService.message.sendText2Chat(
      appInfo.errChatId,
      `页面链接https://mi-chat-fe.cnbj1.mi-fds.com/mi-chat-fe/bytes/${newId}.html`,
      "byteMonitor 更新"
    )
  } catch (error) {
    const errorMessage = `byteMonitor error: ${error}`
    logger.error(errorMessage)
  } finally {
    logger.info("byteMonitor finished")
  }
}
export default byteMonitor

View File

@ -2,6 +2,7 @@ import schedule from "node-schedule"
import report from "../controller/groupAgent/report"
import { loginPbClient } from "../db/pbClient"
import byteMonitor from "./byteMonitor"
import fmMonitor from "./fmMonitor"
import sendZhongNotify from "./zhongNotify"
@ -20,4 +21,7 @@ export const initSchedule = async () => {
// 定时任务,每小时执行一次
schedule.scheduleJob("0 * * * *", loginPbClient)
// 定时任务,每小时执行一次
schedule.scheduleJob("0 * * * *", byteMonitor)
}