This commit is contained in:
Mrx
2026-03-13 18:07:24 +08:00
parent f16274d1e9
commit 209ffec51d
16 changed files with 309 additions and 1176 deletions

View File

@@ -90,6 +90,22 @@ func (tp *TextProcessor) StripHTML(text string) string {
return text
}
// Precompiled patterns used by HTMLToPlainWithBreaks. They are compiled once
// at package initialisation instead of on every call.
var (
	// plainBreaksBlockCloseRe matches closing block-level tags (p, div, br, tr,
	// li, h1-h6), case-insensitively, together with any trailing whitespace.
	plainBreaksBlockCloseRe = regexp.MustCompile(`(?i)</(p|div|br|tr|li|h[1-6])>\s*`)
	// plainBreaksLineBreakRe matches <br>, <br/> and <br />, case-insensitively,
	// together with any trailing whitespace.
	plainBreaksLineBreakRe = regexp.MustCompile(`(?i)<br\s*/?>\s*`)
	// plainBreaksAnyTagRe matches any remaining tag-like span "<...>".
	plainBreaksAnyTagRe = regexp.MustCompile(`<[^>]+>`)
	// plainBreaksInlineSpaceRe matches runs of spaces and tabs.
	plainBreaksInlineSpaceRe = regexp.MustCompile(`[ \t]+`)
	// plainBreaksExtraNewlinesRe matches three or more consecutive newlines.
	plainBreaksExtraNewlinesRe = regexp.MustCompile(`\n{3,}`)
)

// HTMLToPlainWithBreaks converts HTML to plain text while keeping the line
// structure of rich text: closing block-level tags and <br> become newlines.
// It is intended for PDF output, where paragraphs and line breaks must be
// restored so the content does not collapse into a single run of text.
//
// The returned string has all tags removed, runs of spaces/tabs collapsed to a
// single space, runs of three or more newlines reduced to two (one blank line
// between paragraphs), and leading/trailing whitespace trimmed.
//
// NOTE(review): HTML entities are decoded before tags are stripped, so escaped
// markup such as "&lt;b&gt;" is decoded to "<b>" and then removed like a real
// tag. This ordering is kept as-is because callers may rely on it.
func (tp *TextProcessor) HTMLToPlainWithBreaks(text string) string {
	text = html.UnescapeString(text)

	// Closing block-level tags end a line.
	text = plainBreaksBlockCloseRe.ReplaceAllString(text, "\n")
	// <br> in any of its forms ends a line.
	text = plainBreaksLineBreakRe.ReplaceAllString(text, "\n")
	// Every remaining tag is dropped without a replacement.
	text = plainBreaksAnyTagRe.ReplaceAllString(text, "")

	// Collapse horizontal whitespace, then cap consecutive newlines at two.
	// Newlines separated by spaces are not merged by this step.
	text = plainBreaksInlineSpaceRe.ReplaceAllString(text, " ")
	text = plainBreaksExtraNewlinesRe.ReplaceAllString(text, "\n\n")

	return strings.TrimSpace(text)
}
// RemoveMarkdownSyntax 移除markdown语法保留纯文本
func (tp *TextProcessor) RemoveMarkdownSyntax(text string) string {
// 移除粗体标记 **text** 或 __text__