f
This commit is contained in:
@@ -90,6 +90,22 @@ func (tp *TextProcessor) StripHTML(text string) string {
|
||||
return text
|
||||
}
|
||||
|
||||
// HTMLToPlainWithBreaks 将 HTML 转为纯文本并保留富文本换行效果(<p><br><div> 等变为换行)
|
||||
// 用于在 PDF 中还原段落与换行,避免内容挤成一团
|
||||
func (tp *TextProcessor) HTMLToPlainWithBreaks(text string) string {
|
||||
text = html.UnescapeString(text)
|
||||
// 块级结束标签转为换行
|
||||
text = regexp.MustCompile(`(?i)</(p|div|br|tr|li|h[1-6])>\s*`).ReplaceAllString(text, "\n")
|
||||
// <br> 自闭合
|
||||
text = regexp.MustCompile(`(?i)<br\s*/?>\s*`).ReplaceAllString(text, "\n")
|
||||
// 剩余标签移除
|
||||
text = regexp.MustCompile(`<[^>]+>`).ReplaceAllString(text, "")
|
||||
// 连续空白/换行压缩为最多两个换行(段间距)
|
||||
text = regexp.MustCompile(`[ \t]+`).ReplaceAllString(text, " ")
|
||||
text = regexp.MustCompile(`\n{3,}`).ReplaceAllString(text, "\n\n")
|
||||
return strings.TrimSpace(text)
|
||||
}
|
||||
|
||||
// RemoveMarkdownSyntax 移除markdown语法,保留纯文本
|
||||
func (tp *TextProcessor) RemoveMarkdownSyntax(text string) string {
|
||||
// 移除粗体标记 **text** 或 __text__
|
||||
|
||||
Reference in New Issue
Block a user