feat: 添加byteMonitor以监控并翻译最新页面内容,更新相关依赖
This commit is contained in:
parent
26e51e132b
commit
32396aecf1
1
.vscode/settings.json
vendored
1
.vscode/settings.json
vendored
@ -6,6 +6,7 @@
|
||||
"CEINTL",
|
||||
"Chakroun",
|
||||
"CICD",
|
||||
"cnbj",
|
||||
"commitlint",
|
||||
"dbaeumer",
|
||||
"deepseek",
|
||||
|
@ -20,6 +20,7 @@
|
||||
"@commitlint/cli": "^19.7.1",
|
||||
"@commitlint/config-conventional": "^19.7.1",
|
||||
"@eslint/js": "^9.19.0",
|
||||
"@types/jsdom": "^21.1.7",
|
||||
"@types/node-schedule": "^2.1.7",
|
||||
"@types/uuid": "^10.0.0",
|
||||
"bun-types": "^1.2.2",
|
||||
@ -45,6 +46,7 @@
|
||||
"@langchain/langgraph": "^0.2.44",
|
||||
"@langchain/openai": "^0.3.17",
|
||||
"joi": "^17.13.3",
|
||||
"jsdom": "^26.0.0",
|
||||
"langfuse-langchain": "^3.35.1",
|
||||
"node-schedule": "^2.1.1",
|
||||
"p-limit": "^6.2.0",
|
||||
|
237
schedule/byteMonitor.ts
Normal file
237
schedule/byteMonitor.ts
Normal file
@ -0,0 +1,237 @@
|
||||
import { S3Client, S3File } from "bun"
|
||||
import { JSDOM } from "jsdom"
|
||||
|
||||
import initAppConfig, { APP_CONFIG, ConfigModel } from "../constant/config"
|
||||
import pbClient from "../db/pbClient"
|
||||
import { Context } from "../types"
|
||||
import { genContextManually } from "../utils/genContext"
|
||||
import llm from "../utils/llm"
|
||||
|
||||
// Finish loading the application configuration before the rest of this module runs.
// NOTE(review): presumably this is what populates APP_CONFIG, which writeHtml reads
// for the S3 credentials — confirm against ../constant/config.
await initAppConfig()
|
||||
|
||||
/**
 * Translates a batch of texts into Simplified Chinese with a single LLM call.
 *
 * The "batchTranslate" response is parsed as a JSON array whose entries are
 * matched positionally with `textList`.
 *
 * @param textList - Texts to translate
 * @param requestId - Request ID forwarded to the LLM call
 * @returns Map from each source text to its translation; texts for which the
 *   response holds no non-empty string are left out of the map
 * @throws SyntaxError if the response is not valid JSON
 * @throws Error if the response parses to something other than an array
 */
const translateTextList = async (textList: string[], requestId: string) => {
  const translatedTexts = await llm.invoke(
    "batchTranslate",
    {
      inputArray: textList,
      targetLang: "简体中文",
    },
    requestId,
    1,
    true
  )

  // The model output is untrusted: check its shape before indexing into it.
  const parsed: unknown = JSON.parse(translatedTexts as string)
  if (!Array.isArray(parsed)) {
    throw new Error("batchTranslate response is not a JSON array")
  }

  // Null-prototype object so that page text such as "__proto__" or
  // "constructor" is stored and read back as an ordinary key.
  const translatedMap: Record<string, string> = Object.create(null)
  textList.forEach((text, index) => {
    const translated: unknown = parsed[index]
    // A short or malformed response must not leave `undefined` behind a `string` type.
    if (typeof translated === "string" && translated !== "") {
      translatedMap[text] = translated
    }
  })

  return translatedMap
}
|
||||
|
||||
/**
 * Decides whether a string is worth sending to the translator.
 *
 * @param str - Candidate string
 * @returns `true` when the trimmed string contains at least one letter and is
 *   not made up solely of emoji; `false` otherwise
 */
const getIsTranslatableString = (str: string): boolean => {
  const trimmed = str.trim()

  // 1. Empty or whitespace-only strings carry nothing to translate.
  if (trimmed === "") return false

  // 2. Emoji-only strings. \p{Emoji} alone does not match the joiners,
  //    variation selectors and modifiers that glue compound emoji together
  //    (ZWJ families, "ℹ️"), so Emoji_Component is included as well.
  const isEmojiOnly = /^[\p{Emoji}\p{Emoji_Component}]+$/u.test(trimmed)
  if (isEmojiOnly) return false

  // 3. Require at least one letter of any script; digits or punctuation
  //    on their own are not translatable.
  return /\p{L}/u.test(trimmed)
}
|
||||
|
||||
/**
 * Downloads one issue from the bytes.dev archive.
 *
 * @param newId - Archive ID of the issue to download
 * @returns The page HTML with root-relative `href`/`src` values rewritten to
 *   absolute bytes.dev URLs, or an empty string when the response is not OK
 */
const getNewPage = async (newId: number) => {
  const response = await fetch(`https://bytes.dev/archives/${newId}`)
  if (!response.ok) return ""

  const html = await response.text()

  // Prefix root-relative URLs ("/x") with the site origin; the negative
  // lookahead leaves protocol-relative URLs ("//x") untouched.
  return html.replace(/(href|src)="\/(?!\/)/g, `$1="https://bytes.dev/`)
}
|
||||
|
||||
/**
 * Translates the visible text of an HTML page into Simplified Chinese.
 *
 * Steps: parse the page with JSDOM, drop the `__NEXT_DATA__` script, collect
 * the direct text nodes of the selected elements, translate the de-duplicated
 * texts in chunks, write the translations back, and make every link open in a
 * new tab.
 *
 * @param ctx - Context object; only its `logger` is used here
 * @param rawHtml - Original HTML content
 * @returns The serialized HTML document after translation
 */
const translateHTML = async (ctx: Context, rawHtml: string) => {
  const { logger } = ctx
  const dom = new JSDOM(rawHtml)
  const document = dom.window.document

  // Remove the __NEXT_DATA__ script. Optional chaining because a page without
  // that script would otherwise make `.remove()` throw on null.
  document.querySelector("script#__NEXT_DATA__")?.remove()

  // Elements whose text is translated (add tags here to extend coverage).
  // Every entry skips elements nested inside <code>.
  const translatableTags = [
    "p",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "span",
    "a",
    "li",
    "td",
    "th",
    "caption",
    "button",
    "label",
    "title",
  ]
  const selector = translatableTags
    .map((tag) => `${tag}:not(code *)`)
    .join(", ")
  const targetNodes = Array.from(document.querySelectorAll(selector))

  // Collect each translatable text node once, together with its trimmed text,
  // so extraction and write-back operate on exactly the same nodes. Only
  // direct child text nodes are taken, so nested targets are not visited twice.
  const textEntries = targetNodes.flatMap((ele) =>
    Array.from(ele.childNodes)
      .filter((node) => node.nodeType === 3) // 3 === Node.TEXT_NODE
      .map((node) => ({ node, text: node.textContent?.trim() ?? "" }))
      .filter(({ text }) => getIsTranslatableString(text))
  )

  const uniqueTexts = [...new Set(textEntries.map(({ text }) => text))]

  logger.debug(`uniqueTexts: ${uniqueTexts}`)

  // Translate in chunks so that no single request grows too long.
  const chunkSize = 20
  const chunks: string[][] = []
  for (let i = 0; i < uniqueTexts.length; i += chunkSize) {
    chunks.push(uniqueTexts.slice(i, i + chunkSize))
  }
  const translatedMaps = await Promise.all(
    chunks.map((chunk) => translateTextList(chunk, "translateHTML"))
  )

  // Merge into a Map: unlike a plain object, a lookup for page text such as
  // "__proto__" or "constructor" can never resolve to an inherited member.
  const translations = new Map<string, string>()
  for (const translatedMap of translatedMaps) {
    for (const [source, translated] of Object.entries(translatedMap)) {
      translations.set(source, translated)
    }
  }

  // Write the translations back into the page.
  for (const { node, text } of textEntries) {
    const translated = translations.get(text)
    if (!translated) continue // no translation returned: keep the original text

    // Keep the node's surrounding whitespace; it separates the text from
    // neighbouring inline elements such as links.
    const raw = node.textContent ?? ""
    const leading = raw.slice(0, raw.length - raw.trimStart().length)
    const trailing = raw.slice(raw.trimEnd().length)
    node.textContent = `${leading}${translated}${trailing}`
  }

  // Make every <a> open its link in a new tab.
  document.querySelectorAll("a").forEach((anchor) => {
    anchor.setAttribute("target", "_blank")
  })

  return dom.serialize()
}
|
||||
|
||||
/**
 * Reads the ID of the most recently processed page.
 *
 * @returns The `value` field of record "5l8a8u85p5v4aid" in the "env" collection
 */
const getLatestId = async () => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  const { value } = await envCollection.getOne("5l8a8u85p5v4aid")
  return value
}
|
||||
|
||||
/**
 * Stores the ID of the most recently processed page.
 *
 * @param id - Page ID written to the `value` field of record
 *   "5l8a8u85p5v4aid" in the "env" collection
 */
const setLatestId = async (id: number) => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  await envCollection.update("5l8a8u85p5v4aid", { value: id })
}
|
||||
|
||||
/**
 * Uploads an HTML page to the "mi-chat-fe" bucket as `bytes/<version>.html`.
 *
 * @param html - HTML content to store
 * @param version - Page ID, used as the object's file name
 */
const writeHtml = async (html: string, version: number) => {
  const objectKey = `bytes/${version}.html`

  // Credentials are read from the application configuration.
  const storage = new S3Client({
    accessKeyId: APP_CONFIG.S3_MICHAT_AK,
    secretAccessKey: APP_CONFIG.S3_MICHAT_SK,
    region: "cnbj1",
    endpoint: "https://s3-cnbj1.mi-fds.net",
    bucket: "mi-chat-fe",
  })

  const target: S3File = storage.file(objectKey)
  await target.write(html)
}
|
||||
|
||||
/**
 * Checks for the next bytes.dev issue and, when it exists, translates and
 * publishes it.
 *
 * Flow: read the last processed ID, fetch the page for ID + 1, translate it,
 * upload the result, persist the new ID, then send the page link to the chat
 * identified by `appInfo.errChatId`. Any failure along the way is logged and
 * not rethrown.
 */
const byteMonitor = async () => {
  const ctx = await genContextManually()
  const { logger, larkService, appInfo } = ctx
  logger.info("byteMonitor start")
  try {
    const latestId = await getLatestId()
    if (!latestId) throw new Error("getLatestId empty")

    const newId = Number(latestId) + 1
    // A non-numeric stored value would otherwise request /archives/NaN and
    // surface as a misleading "getNewPage empty" error.
    if (!Number.isInteger(newId)) {
      throw new Error(`getLatestId invalid: ${latestId}`)
    }

    // getNewPage yields "" for any non-OK response, e.g. an issue that is not published yet.
    const newPage = await getNewPage(newId)
    if (!newPage) throw new Error("getNewPage empty")

    const translatedPage = await translateHTML(ctx, newPage)
    await writeHtml(translatedPage, newId)

    // Persist the ID only after the upload succeeded, so a failed run is retried.
    await setLatestId(newId)

    await larkService.message.sendText2Chat(
      appInfo.errChatId,
      `页面链接:https://mi-chat-fe.cnbj1.mi-fds.com/mi-chat-fe/bytes/${newId}.html`,
      "byteMonitor 更新"
    )
  } catch (error) {
    const errorMessage = `byteMonitor error: ${error}`
    logger.error(errorMessage)
  } finally {
    logger.info("byteMonitor finished")
  }
}
|
||||
|
||||
export default byteMonitor
|
@ -2,6 +2,7 @@ import schedule from "node-schedule"
|
||||
|
||||
import report from "../controller/groupAgent/report"
|
||||
import { loginPbClient } from "../db/pbClient"
|
||||
import byteMonitor from "./byteMonitor"
|
||||
import fmMonitor from "./fmMonitor"
|
||||
import sendZhongNotify from "./zhongNotify"
|
||||
|
||||
@ -20,4 +21,7 @@ export const initSchedule = async () => {
|
||||
|
||||
// 定时任务,每小时执行一次
|
||||
schedule.scheduleJob("0 * * * *", loginPbClient)
|
||||
|
||||
// 定时任务,每小时执行一次
|
||||
schedule.scheduleJob("0 * * * *", byteMonitor)
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user