feat: 添加byteMonitor以监控并翻译最新页面内容,更新相关依赖
This commit is contained in:
parent
26e51e132b
commit
32396aecf1
1
.vscode/settings.json
vendored
1
.vscode/settings.json
vendored
@ -6,6 +6,7 @@
|
|||||||
"CEINTL",
|
"CEINTL",
|
||||||
"Chakroun",
|
"Chakroun",
|
||||||
"CICD",
|
"CICD",
|
||||||
|
"cnbj",
|
||||||
"commitlint",
|
"commitlint",
|
||||||
"dbaeumer",
|
"dbaeumer",
|
||||||
"deepseek",
|
"deepseek",
|
||||||
|
@ -20,6 +20,7 @@
|
|||||||
"@commitlint/cli": "^19.7.1",
|
"@commitlint/cli": "^19.7.1",
|
||||||
"@commitlint/config-conventional": "^19.7.1",
|
"@commitlint/config-conventional": "^19.7.1",
|
||||||
"@eslint/js": "^9.19.0",
|
"@eslint/js": "^9.19.0",
|
||||||
|
"@types/jsdom": "^21.1.7",
|
||||||
"@types/node-schedule": "^2.1.7",
|
"@types/node-schedule": "^2.1.7",
|
||||||
"@types/uuid": "^10.0.0",
|
"@types/uuid": "^10.0.0",
|
||||||
"bun-types": "^1.2.2",
|
"bun-types": "^1.2.2",
|
||||||
@ -45,6 +46,7 @@
|
|||||||
"@langchain/langgraph": "^0.2.44",
|
"@langchain/langgraph": "^0.2.44",
|
||||||
"@langchain/openai": "^0.3.17",
|
"@langchain/openai": "^0.3.17",
|
||||||
"joi": "^17.13.3",
|
"joi": "^17.13.3",
|
||||||
|
"jsdom": "^26.0.0",
|
||||||
"langfuse-langchain": "^3.35.1",
|
"langfuse-langchain": "^3.35.1",
|
||||||
"node-schedule": "^2.1.1",
|
"node-schedule": "^2.1.1",
|
||||||
"p-limit": "^6.2.0",
|
"p-limit": "^6.2.0",
|
||||||
|
237
schedule/byteMonitor.ts
Normal file
237
schedule/byteMonitor.ts
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
import { S3Client, S3File } from "bun"
|
||||||
|
import { JSDOM } from "jsdom"
|
||||||
|
|
||||||
|
import initAppConfig, { APP_CONFIG, ConfigModel } from "../constant/config"
|
||||||
|
import pbClient from "../db/pbClient"
|
||||||
|
import { Context } from "../types"
|
||||||
|
import { genContextManually } from "../utils/genContext"
|
||||||
|
import llm from "../utils/llm"
|
||||||
|
|
||||||
|
// Top-level await: this runs as a side effect when the module is first imported,
// before any function below is called.
// NOTE(review): presumably this populates APP_CONFIG, which writeHtml reads for
// the S3 credentials — confirm in ../constant/config.
await initAppConfig()
|
||||||
|
|
||||||
|
/**
 * Translates a batch of strings into Simplified Chinese through the LLM helper.
 *
 * The "batchTranslate" call is expected to answer with a JSON-encoded array that
 * holds one translation per input string, in input order. The reply is validated
 * before use because a non-array reply or a reply of a different length cannot be
 * aligned with the inputs by index.
 *
 * @param textList - Strings to translate.
 * @param requestId - Request identifier forwarded to `llm.invoke`.
 * @returns A record mapping each source string to its translation. Entries whose
 *   translation is not a string are omitted.
 * @throws SyntaxError when the reply is not valid JSON.
 * @throws Error when the reply is not an array with one entry per input.
 */
const translateTextList = async (
  textList: string[],
  requestId: string
): Promise<Record<string, string>> => {
  // Nothing to translate: avoid a pointless LLM round trip.
  if (textList.length === 0) return {}

  // NOTE(review): the trailing `1` and `true` arguments are passed through
  // unchanged; their meaning is defined by `llm.invoke` — confirm there.
  const rawResponse = await llm.invoke(
    "batchTranslate",
    {
      inputArray: textList,
      targetLang: "简体中文",
    },
    requestId,
    1,
    true
  )

  const parsed: unknown = JSON.parse(rawResponse as string)

  if (!Array.isArray(parsed) || parsed.length !== textList.length) {
    const received = Array.isArray(parsed)
      ? `array of length ${parsed.length}`
      : typeof parsed
    throw new Error(
      `batchTranslate returned ${received}, expected array of length ${textList.length}`
    )
  }

  const translatedMap: Record<string, string> = {}
  textList.forEach((text, index) => {
    const translated: unknown = parsed[index]
    // Only keep real strings so the declared Record<string, string> holds.
    if (typeof translated === "string") {
      translatedMap[text] = translated
    }
  })

  return translatedMap
}
|
||||||
|
|
||||||
|
/**
 * Matches strings made up solely of emoji and emoji building blocks.
 * `Emoji_Component` covers the zero-width joiner, variation selectors, keycap
 * mark, skin-tone modifiers and similar code points, so composed sequences
 * (family emoji, "ℹ️", keycaps) are recognised as emoji-only as well.
 */
const EMOJI_ONLY_PATTERN = /^[\p{Emoji}\p{Emoji_Component}]+$/u

/** Matches any letter of any script. */
const LETTER_PATTERN = /\p{L}/u

/**
 * Decides whether a string is worth sending to the translator.
 *
 * @param str - Candidate string; surrounding whitespace is ignored.
 * @returns `false` for empty/whitespace-only strings, for strings consisting
 *   only of emoji (including composed sequences), and for strings without any
 *   letter; `true` otherwise.
 */
const getIsTranslatableString = (str: string): boolean => {
  const trimmed = str.trim()

  // 1. Empty or whitespace-only.
  if (trimmed === "") return false

  // 2. Emoji-only. This has to run before the letter test because a few emoji
  //    code points (e.g. U+2139 "ℹ") are also classified as letters.
  if (EMOJI_ONLY_PATTERN.test(trimmed)) return false

  // 3. Digits and punctuation alone are not translatable; at least one letter
  //    must be present.
  return LETTER_PATTERN.test(trimmed)
}
|
||||||
|
|
||||||
|
/** Origin of the site the archive pages are fetched from. */
const BYTES_ORIGIN = "https://bytes.dev"

/**
 * Matches an `href`/`src` attribute whose value starts with a single `/`
 * (root-relative URL), quoted with either `"` or `'`. Protocol-relative values
 * (`//host/...`) are excluded by the negative lookahead.
 */
const ROOT_RELATIVE_ATTR_PATTERN = /(href|src)=(["'])\/(?!\/)/g

/**
 * Downloads the archive page with the given id and makes its root-relative
 * links absolute.
 *
 * @param newId - Archive id used to build the page URL.
 * @returns The page HTML with root-relative `href`/`src` values rewritten to
 *   absolute URLs, or an empty string when the response status is not OK.
 */
const getNewPage = async (newId: number) => {
  const url = `${BYTES_ORIGIN}/archives/${newId}`
  const res = await fetch(url)

  if (!res.ok) {
    return ""
  }

  const htmlContent = await res.text()

  // A replacer function is used instead of a "$1..." template so the inserted
  // origin can never be misread as a replacement pattern.
  return htmlContent.replace(
    ROOT_RELATIVE_ATTR_PATTERN,
    (_match: string, attr: string, quote: string) =>
      `${attr}=${quote}${BYTES_ORIGIN}/`
  )
}
|
||||||
|
|
||||||
|
/**
 * Tags whose direct text content gets translated. Add a tag here to extend the
 * translation to more elements; every tag is excluded when nested inside <code>.
 */
const TRANSLATABLE_TAGS = [
  "p",
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "span",
  "a",
  "li",
  "td",
  "th",
  "caption",
  "button",
  "label",
  "title",
]

/** Maximum number of strings sent to the translator in one request. */
const TRANSLATE_CHUNK_SIZE = 20

/**
 * Translates the visible text of an HTML page and returns the rewritten page.
 *
 * Steps: remove the `__NEXT_DATA__` script if present, collect the direct text
 * nodes of the translatable elements, translate the distinct strings in chunks,
 * write the translations back, and make every link open in a new tab.
 *
 * @param ctx - Context object; only its `logger` is used here.
 * @param rawHtml - Source HTML.
 * @returns The serialized, translated HTML document.
 */
const translateHTML = async (ctx: Context, rawHtml: string): Promise<string> => {
  const { logger } = ctx
  const dom = new JSDOM(rawHtml)
  const document = dom.window.document

  // The script may be missing on some pages; optional chaining keeps that from
  // throwing on a null query result.
  document.querySelector("script#__NEXT_DATA__")?.remove()

  const selector = TRANSLATABLE_TAGS.map((tag) => `${tag}:not(code *)`).join(
    ", "
  )
  const targetNodes = Array.from(document.querySelectorAll(selector))

  // Collect every translatable direct text node once, together with its
  // trimmed text, so the write-back pass below does not re-scan the DOM.
  const candidates: { node: ChildNode; text: string }[] = []
  targetNodes.forEach((ele) => {
    Array.from(ele.childNodes).forEach((node) => {
      // nodeType 3 is a text node.
      if (node.nodeType !== 3) return
      const text = node.textContent?.trim()
      if (text && getIsTranslatableString(text)) {
        candidates.push({ node, text })
      }
    })
  })

  const uniqueTexts = [...new Set(candidates.map(({ text }) => text))]

  logger.debug(`uniqueTexts: ${uniqueTexts}`)

  // Split into several requests so a single prompt does not get too long.
  const chunks: string[][] = []
  for (let i = 0; i < uniqueTexts.length; i += TRANSLATE_CHUNK_SIZE) {
    chunks.push(uniqueTexts.slice(i, i + TRANSLATE_CHUNK_SIZE))
  }

  const translatedMaps = await Promise.all(
    chunks.map((chunk) => translateTextList(chunk, "translateHTML"))
  )

  // A Map is used for lookups so page text that happens to equal an
  // Object.prototype key (e.g. "__proto__") cannot resolve to a non-string.
  const translations = new Map<string, string>()
  translatedMaps.forEach((translatedMap) => {
    Object.entries(translatedMap).forEach(([source, translated]) => {
      translations.set(source, translated)
    })
  })

  // Write the translations back, keeping the whitespace that surrounded the
  // original text so spacing next to inline siblings is not lost.
  candidates.forEach(({ node, text }) => {
    const translated = translations.get(text)
    if (!translated) return
    const original = node.textContent ?? ""
    node.textContent = original.replace(text, () => translated)
  })

  // Make every link open in a new tab.
  document.querySelectorAll("a").forEach((anchor) => {
    anchor.setAttribute("target", "_blank")
  })

  return dom.serialize()
}
|
||||||
|
|
||||||
|
/**
 * Reads the id of the last processed page.
 *
 * The id is stored in the `value` field of record "5l8a8u85p5v4aid" in the
 * "env" collection.
 *
 * @returns The stored `value` of that record.
 */
const getLatestId = async () => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  const { value } = await envCollection.getOne("5l8a8u85p5v4aid")
  return value
}
|
||||||
|
|
||||||
|
/**
 * Stores the id of the last processed page.
 *
 * Writes `id` into the `value` field of record "5l8a8u85p5v4aid" in the "env"
 * collection.
 *
 * @param id - Page id to persist.
 */
const setLatestId = async (id: number) => {
  const envCollection = pbClient.collection<ConfigModel>("env")
  const patch = { value: id }
  await envCollection.update("5l8a8u85p5v4aid", patch)
}
|
||||||
|
|
||||||
|
/**
 * Uploads an HTML document to the "mi-chat-fe" bucket.
 *
 * @param html - HTML content to upload.
 * @param version - Number used in the object key: `bytes/<version>.html`.
 */
const writeHtml = async (html: string, version: number) => {
  const { S3_MICHAT_AK, S3_MICHAT_SK } = APP_CONFIG
  const objectKey = `bytes/${version}.html`

  const s3 = new S3Client({
    accessKeyId: S3_MICHAT_AK,
    secretAccessKey: S3_MICHAT_SK,
    region: "cnbj1",
    endpoint: "https://s3-cnbj1.mi-fds.net",
    bucket: "mi-chat-fe",
  })

  const target: S3File = s3.file(objectKey)
  await target.write(html)
}
|
||||||
|
|
||||||
|
/**
 * Checks for the next archive page and, when it exists, translates and
 * publishes it.
 *
 * Flow: read the last processed id, fetch the page for id + 1, translate it,
 * upload the result, persist the new id, then send a chat message with the
 * public link. Any failure along the way is logged and swallowed, so the
 * function never rejects because of a step inside the `try` block; the stored
 * id is only advanced after a successful upload.
 */
const byteMonitor = async () => {
  const ctx = await genContextManually()
  const { logger, larkService, appInfo } = ctx
  logger.info("byteMonitor start")

  try {
    const latestId = await getLatestId()
    if (!latestId) throw new Error("getLatestId empty")

    const newId = Number(latestId) + 1
    // A non-numeric stored value would otherwise become NaN and surface later
    // as a misleading "getNewPage empty" error after requesting /archives/NaN.
    if (!Number.isSafeInteger(newId)) {
      throw new Error(`getLatestId invalid: ${JSON.stringify(latestId)}`)
    }

    // getNewPage returns "" for any non-OK response, which includes the case
    // where the next page has not been published yet.
    const newPage = await getNewPage(newId)
    if (!newPage) throw new Error("getNewPage empty")

    const translatedPage = await translateHTML(ctx, newPage)
    await writeHtml(translatedPage, newId)
    await setLatestId(newId)

    // The update notice is sent to the chat identified by appInfo.errChatId.
    await larkService.message.sendText2Chat(
      appInfo.errChatId,
      `页面链接:https://mi-chat-fe.cnbj1.mi-fds.com/mi-chat-fe/bytes/${newId}.html`,
      "byteMonitor 更新"
    )
  } catch (error) {
    const errorMessage = `byteMonitor error: ${error}`
    logger.error(errorMessage)
  } finally {
    logger.info("byteMonitor finished")
  }
}
|
||||||
|
|
||||||
|
export default byteMonitor
|
@ -2,6 +2,7 @@ import schedule from "node-schedule"
|
|||||||
|
|
||||||
import report from "../controller/groupAgent/report"
|
import report from "../controller/groupAgent/report"
|
||||||
import { loginPbClient } from "../db/pbClient"
|
import { loginPbClient } from "../db/pbClient"
|
||||||
|
import byteMonitor from "./byteMonitor"
|
||||||
import fmMonitor from "./fmMonitor"
|
import fmMonitor from "./fmMonitor"
|
||||||
import sendZhongNotify from "./zhongNotify"
|
import sendZhongNotify from "./zhongNotify"
|
||||||
|
|
||||||
@ -20,4 +21,7 @@ export const initSchedule = async () => {
|
|||||||
|
|
||||||
// 定时任务,每小时执行一次
|
// 定时任务,每小时执行一次
|
||||||
schedule.scheduleJob("0 * * * *", loginPbClient)
|
schedule.scheduleJob("0 * * * *", loginPbClient)
|
||||||
|
|
||||||
|
// 定时任务,每小时执行一次
|
||||||
|
schedule.scheduleJob("0 * * * *", byteMonitor)
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user