2025-12-03 12:03:42 +08:00
|
|
|
|
package pdf
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"html"
|
|
|
|
|
|
"regexp"
|
|
|
|
|
|
"strings"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// TextProcessor is the receiver for this package's text helpers: character
// clean-up, HTML stripping/segmentation and Markdown removal. It has no
// fields, so the zero value is ready to use.
type TextProcessor struct{}
|
|
|
|
|
|
|
|
|
|
|
|
// NewTextProcessor 创建文本处理器
|
|
|
|
|
|
func NewTextProcessor() *TextProcessor {
|
|
|
|
|
|
return &TextProcessor{}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// CleanText 清理文本中的无效字符和乱码
|
|
|
|
|
|
func (tp *TextProcessor) CleanText(text string) string {
|
|
|
|
|
|
// 先解码HTML实体
|
|
|
|
|
|
text = html.UnescapeString(text)
|
|
|
|
|
|
|
|
|
|
|
|
// 移除或替换无效的UTF-8字符
|
|
|
|
|
|
var result strings.Builder
|
|
|
|
|
|
for _, r := range text {
|
|
|
|
|
|
// 保留:中文字符、英文字母、数字、常见标点符号、空格、换行符等
|
|
|
|
|
|
if (r >= 0x4E00 && r <= 0x9FFF) || // 中文字符范围
|
|
|
|
|
|
(r >= 0x3400 && r <= 0x4DBF) || // 扩展A
|
|
|
|
|
|
(r >= 0x20000 && r <= 0x2A6DF) || // 扩展B
|
|
|
|
|
|
(r >= 'A' && r <= 'Z') || // 大写字母
|
|
|
|
|
|
(r >= 'a' && r <= 'z') || // 小写字母
|
|
|
|
|
|
(r >= '0' && r <= '9') || // 数字
|
|
|
|
|
|
(r >= 0x0020 && r <= 0x007E) || // ASCII可打印字符
|
|
|
|
|
|
(r == '\n' || r == '\r' || r == '\t') || // 换行和制表符
|
|
|
|
|
|
(r >= 0x3000 && r <= 0x303F) || // CJK符号和标点
|
|
|
|
|
|
(r >= 0xFF00 && r <= 0xFFEF) { // 全角字符
|
|
|
|
|
|
result.WriteRune(r)
|
|
|
|
|
|
} else if r > 0x007F && r < 0x00A0 {
|
|
|
|
|
|
// 无效的控制字符,替换为空格
|
|
|
|
|
|
result.WriteRune(' ')
|
|
|
|
|
|
}
|
|
|
|
|
|
// 其他字符(如乱码)直接跳过
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return result.String()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// CleanTextPreservingMarkdown 清理文本但保留markdown语法字符
|
|
|
|
|
|
func (tp *TextProcessor) CleanTextPreservingMarkdown(text string) string {
|
|
|
|
|
|
// 先解码HTML实体
|
|
|
|
|
|
text = html.UnescapeString(text)
|
|
|
|
|
|
|
|
|
|
|
|
// 移除或替换无效的UTF-8字符,但保留markdown语法字符
|
|
|
|
|
|
var result strings.Builder
|
|
|
|
|
|
for _, r := range text {
|
|
|
|
|
|
// 保留:中文字符、英文字母、数字、常见标点符号、空格、换行符等
|
|
|
|
|
|
// 特别保留markdown语法字符:* _ ` [ ] ( ) # - | : !
|
|
|
|
|
|
if (r >= 0x4E00 && r <= 0x9FFF) || // 中文字符范围
|
|
|
|
|
|
(r >= 0x3400 && r <= 0x4DBF) || // 扩展A
|
|
|
|
|
|
(r >= 0x20000 && r <= 0x2A6DF) || // 扩展B
|
|
|
|
|
|
(r >= 'A' && r <= 'Z') || // 大写字母
|
|
|
|
|
|
(r >= 'a' && r <= 'z') || // 小写字母
|
|
|
|
|
|
(r >= '0' && r <= '9') || // 数字
|
|
|
|
|
|
(r >= 0x0020 && r <= 0x007E) || // ASCII可打印字符(包括markdown语法字符)
|
|
|
|
|
|
(r == '\n' || r == '\r' || r == '\t') || // 换行和制表符
|
|
|
|
|
|
(r >= 0x3000 && r <= 0x303F) || // CJK符号和标点
|
|
|
|
|
|
(r >= 0xFF00 && r <= 0xFFEF) { // 全角字符
|
|
|
|
|
|
result.WriteRune(r)
|
|
|
|
|
|
} else if r > 0x007F && r < 0x00A0 {
|
|
|
|
|
|
// 无效的控制字符,替换为空格
|
|
|
|
|
|
result.WriteRune(' ')
|
|
|
|
|
|
}
|
|
|
|
|
|
// 其他字符(如乱码)直接跳过
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return result.String()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// StripHTML 去除HTML标签(不转换换行,直接移除标签)
|
|
|
|
|
|
func (tp *TextProcessor) StripHTML(text string) string {
|
|
|
|
|
|
// 解码HTML实体
|
|
|
|
|
|
text = html.UnescapeString(text)
|
|
|
|
|
|
|
|
|
|
|
|
// 直接移除所有HTML标签,不进行换行转换
|
|
|
|
|
|
re := regexp.MustCompile(`<[^>]+>`)
|
|
|
|
|
|
text = re.ReplaceAllString(text, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 清理多余空白
|
|
|
|
|
|
text = strings.TrimSpace(text)
|
|
|
|
|
|
|
|
|
|
|
|
return text
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-03-13 18:07:24 +08:00
|
|
|
|
// HTMLToPlainWithBreaks 将 HTML 转为纯文本并保留富文本换行效果(<p><br><div> 等变为换行)
|
|
|
|
|
|
// 用于在 PDF 中还原段落与换行,避免内容挤成一团
|
|
|
|
|
|
func (tp *TextProcessor) HTMLToPlainWithBreaks(text string) string {
|
|
|
|
|
|
text = html.UnescapeString(text)
|
|
|
|
|
|
// 块级结束标签转为换行
|
|
|
|
|
|
text = regexp.MustCompile(`(?i)</(p|div|br|tr|li|h[1-6])>\s*`).ReplaceAllString(text, "\n")
|
|
|
|
|
|
// <br> 自闭合
|
|
|
|
|
|
text = regexp.MustCompile(`(?i)<br\s*/?>\s*`).ReplaceAllString(text, "\n")
|
|
|
|
|
|
// 剩余标签移除
|
|
|
|
|
|
text = regexp.MustCompile(`<[^>]+>`).ReplaceAllString(text, "")
|
|
|
|
|
|
// 连续空白/换行压缩为最多两个换行(段间距)
|
|
|
|
|
|
text = regexp.MustCompile(`[ \t]+`).ReplaceAllString(text, " ")
|
|
|
|
|
|
text = regexp.MustCompile(`\n{3,}`).ReplaceAllString(text, "\n\n")
|
|
|
|
|
|
return strings.TrimSpace(text)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-03-16 12:32:41 +08:00
|
|
|
|
// HTMLSegment is one piece of parsed HTML prepared for drawing into a PDF.
// It can describe paragraphs, line breaks, bold text and headings.
type HTMLSegment struct {
	// Text is the plain text, with tags removed and entities decoded.
	Text string
	// Bold reports whether the text is bold.
	Bold bool
	// NewLine marks a line break (for example from <br>).
	NewLine bool
	// NewParagraph marks the start of a new paragraph (for example after
	// </p> or </div>).
	NewParagraph bool
	// HeadingLevel is 1–3 for h1–h3 and 0 for body text.
	HeadingLevel int
}
|
|
|
|
|
|
|
|
|
|
|
|
// ParseHTMLToSegments 将 HTML 解析为用于 PDF 绘制的片段序列,保留段落、换行、加粗与标题
|
|
|
|
|
|
func (tp *TextProcessor) ParseHTMLToSegments(htmlStr string) []HTMLSegment {
|
|
|
|
|
|
htmlStr = html.UnescapeString(htmlStr)
|
|
|
|
|
|
var out []HTMLSegment
|
|
|
|
|
|
blockSplit := regexp.MustCompile(`(?i)(</p>|</div>|</h[1-6]>|<br\s*/?>)\s*`)
|
|
|
|
|
|
parts := blockSplit.Split(htmlStr, -1)
|
|
|
|
|
|
tags := blockSplit.FindAllString(htmlStr, -1)
|
|
|
|
|
|
for i, block := range parts {
|
|
|
|
|
|
block = strings.TrimSpace(block)
|
|
|
|
|
|
var prevTag string
|
|
|
|
|
|
if i > 0 && i-1 < len(tags) {
|
|
|
|
|
|
prevTag = strings.ToLower(strings.TrimSpace(tags[i-1]))
|
|
|
|
|
|
}
|
|
|
|
|
|
isNewParagraph := strings.Contains(prevTag, "</p>") || strings.Contains(prevTag, "</div>") ||
|
|
|
|
|
|
strings.HasPrefix(prevTag, "</h")
|
|
|
|
|
|
isNewLine := strings.Contains(prevTag, "<br")
|
|
|
|
|
|
headingLevel := 0
|
|
|
|
|
|
if strings.HasPrefix(prevTag, "</h1") {
|
|
|
|
|
|
headingLevel = 1
|
|
|
|
|
|
} else if strings.HasPrefix(prevTag, "</h2") {
|
|
|
|
|
|
headingLevel = 2
|
|
|
|
|
|
} else if strings.HasPrefix(prevTag, "</h3") {
|
|
|
|
|
|
headingLevel = 3
|
|
|
|
|
|
}
|
|
|
|
|
|
segments := tp.parseInlineSegments(block)
|
|
|
|
|
|
// 块前先输出段落/换行/标题标记(仅在第一段文本前输出一次)
|
|
|
|
|
|
if i > 0 {
|
|
|
|
|
|
if isNewParagraph || headingLevel > 0 {
|
|
|
|
|
|
out = append(out, HTMLSegment{NewParagraph: true, HeadingLevel: headingLevel})
|
|
|
|
|
|
} else if isNewLine {
|
|
|
|
|
|
out = append(out, HTMLSegment{NewLine: true})
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
for _, seg := range segments {
|
|
|
|
|
|
if seg.Text != "" {
|
|
|
|
|
|
out = append(out, HTMLSegment{Text: seg.Text, Bold: seg.Bold, HeadingLevel: headingLevel})
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return out
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// inlineSeg is a run of inline text together with its bold flag.
type inlineSeg struct {
	// Text is the content of the run.
	Text string
	// Bold reports whether the run is bold.
	Bold bool
}
|
|
|
|
|
|
|
|
|
|
|
|
// parseInlineSegments 解析块内文本,按 <strong>/<b> 拆成片段
|
|
|
|
|
|
func (tp *TextProcessor) parseInlineSegments(block string) []inlineSeg {
|
|
|
|
|
|
var segs []inlineSeg
|
|
|
|
|
|
// 移除所有标签并收集加粗区间(按字符偏移)
|
|
|
|
|
|
reBoldOpen := regexp.MustCompile(`(?i)<(strong|b)>`)
|
|
|
|
|
|
reBoldClose := regexp.MustCompile(`(?i)</(strong|b)>`)
|
|
|
|
|
|
plain := regexp.MustCompile(`<[^>]+>`).ReplaceAllString(block, "")
|
|
|
|
|
|
plain = regexp.MustCompile(`[ \t]+`).ReplaceAllString(plain, " ")
|
|
|
|
|
|
plain = strings.TrimSpace(plain)
|
|
|
|
|
|
if plain == "" {
|
|
|
|
|
|
return segs
|
|
|
|
|
|
}
|
2026-03-16 13:10:42 +08:00
|
|
|
|
// 在 block 上找加粗区间,再映射到 plain(去掉标签后的位置)
|
|
|
|
|
|
// 注意:work 每次循环被截断,必须用相对 work 的索引切片,避免 work[:endInWork] 越界
|
2026-03-16 12:32:41 +08:00
|
|
|
|
work := block
|
|
|
|
|
|
var boldRanges [][2]int
|
2026-03-16 13:10:42 +08:00
|
|
|
|
plainOffset := 0
|
2026-03-16 12:32:41 +08:00
|
|
|
|
for {
|
|
|
|
|
|
idxOpen := reBoldOpen.FindStringIndex(work)
|
|
|
|
|
|
if idxOpen == nil {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
afterOpen := work[idxOpen[1]:]
|
|
|
|
|
|
idxClose := reBoldClose.FindStringIndex(afterOpen)
|
|
|
|
|
|
if idxClose == nil {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
2026-03-16 13:10:42 +08:00
|
|
|
|
closeLen := len(reBoldClose.FindString(afterOpen))
|
|
|
|
|
|
// 使用相对当前 work 的字节偏移,保证 work[:endInWork] 不越界
|
|
|
|
|
|
endInWork := idxOpen[1] + idxClose[0]
|
|
|
|
|
|
workBefore := work[:idxOpen[1]]
|
2026-03-16 12:32:41 +08:00
|
|
|
|
plainBefore := regexp.MustCompile(`<[^>]+>`).ReplaceAllString(workBefore, "")
|
|
|
|
|
|
plainBefore = regexp.MustCompile(`[ \t]+`).ReplaceAllString(plainBefore, " ")
|
2026-03-16 13:10:42 +08:00
|
|
|
|
startPlain := plainOffset + len([]rune(plainBefore))
|
2026-03-16 12:32:41 +08:00
|
|
|
|
workUntil := work[:endInWork]
|
|
|
|
|
|
plainUntil := regexp.MustCompile(`<[^>]+>`).ReplaceAllString(workUntil, "")
|
|
|
|
|
|
plainUntil = regexp.MustCompile(`[ \t]+`).ReplaceAllString(plainUntil, " ")
|
2026-03-16 13:10:42 +08:00
|
|
|
|
endPlain := plainOffset + len([]rune(plainUntil))
|
2026-03-16 12:32:41 +08:00
|
|
|
|
boldRanges = append(boldRanges, [2]int{startPlain, endPlain})
|
2026-03-16 13:10:42 +08:00
|
|
|
|
consumed := work[:endInWork+closeLen]
|
|
|
|
|
|
strippedConsumed := regexp.MustCompile(`<[^>]+>`).ReplaceAllString(consumed, "")
|
|
|
|
|
|
strippedConsumed = regexp.MustCompile(`[ \t]+`).ReplaceAllString(strippedConsumed, " ")
|
|
|
|
|
|
plainOffset += len([]rune(strippedConsumed))
|
|
|
|
|
|
work = work[endInWork+closeLen:]
|
2026-03-16 12:32:41 +08:00
|
|
|
|
}
|
2026-03-16 13:10:42 +08:00
|
|
|
|
// 按 boldRanges 切分 plain(限制区间在 [0,len(runes)] 内,防止越界)
|
2026-03-16 12:32:41 +08:00
|
|
|
|
runes := []rune(plain)
|
2026-03-16 13:10:42 +08:00
|
|
|
|
nr := len(runes)
|
2026-03-16 12:32:41 +08:00
|
|
|
|
inBold := false
|
|
|
|
|
|
var start int
|
2026-03-16 13:10:42 +08:00
|
|
|
|
for i := 0; i <= nr; i++ {
|
2026-03-16 12:32:41 +08:00
|
|
|
|
nowBold := false
|
|
|
|
|
|
for _, r := range boldRanges {
|
2026-03-16 13:10:42 +08:00
|
|
|
|
r0, r1 := r[0], r[1]
|
|
|
|
|
|
if r0 < 0 {
|
|
|
|
|
|
r0 = 0
|
|
|
|
|
|
}
|
|
|
|
|
|
if r1 > nr {
|
|
|
|
|
|
r1 = nr
|
|
|
|
|
|
}
|
|
|
|
|
|
if r0 < r1 && i >= r0 && i < r1 {
|
2026-03-16 12:32:41 +08:00
|
|
|
|
nowBold = true
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2026-03-16 13:10:42 +08:00
|
|
|
|
if nowBold != inBold || i == nr {
|
2026-03-16 12:32:41 +08:00
|
|
|
|
if i > start {
|
|
|
|
|
|
segs = append(segs, inlineSeg{Text: string(runes[start:i]), Bold: inBold})
|
|
|
|
|
|
}
|
|
|
|
|
|
start = i
|
|
|
|
|
|
inBold = nowBold
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if len(segs) == 0 && plain != "" {
|
|
|
|
|
|
segs = append(segs, inlineSeg{Text: plain, Bold: false})
|
|
|
|
|
|
}
|
|
|
|
|
|
return segs
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-03 12:03:42 +08:00
|
|
|
|
// RemoveMarkdownSyntax 移除markdown语法,保留纯文本
|
|
|
|
|
|
func (tp *TextProcessor) RemoveMarkdownSyntax(text string) string {
|
|
|
|
|
|
// 移除粗体标记 **text** 或 __text__
|
|
|
|
|
|
text = regexp.MustCompile(`\*\*([^*]+)\*\*`).ReplaceAllString(text, "$1")
|
|
|
|
|
|
text = regexp.MustCompile(`__([^_]+)__`).ReplaceAllString(text, "$1")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除斜体标记 *text* 或 _text_
|
|
|
|
|
|
text = regexp.MustCompile(`\*([^*]+)\*`).ReplaceAllString(text, "$1")
|
|
|
|
|
|
text = regexp.MustCompile(`_([^_]+)_`).ReplaceAllString(text, "$1")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除代码标记 `code`
|
|
|
|
|
|
text = regexp.MustCompile("`([^`]+)`").ReplaceAllString(text, "$1")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除链接标记 [text](url) -> text
|
|
|
|
|
|
text = regexp.MustCompile(`\[([^\]]+)\]\([^\)]+\)`).ReplaceAllString(text, "$1")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除图片标记  -> alt
|
|
|
|
|
|
text = regexp.MustCompile(`!\[([^\]]*)\]\([^\)]+\)`).ReplaceAllString(text, "$1")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除标题标记 # text -> text
|
|
|
|
|
|
text = regexp.MustCompile(`^#{1,6}\s+(.+)$`).ReplaceAllString(text, "$1")
|
|
|
|
|
|
|
|
|
|
|
|
return text
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// RemoveNonASCII 移除非ASCII字符(保留ASCII字符和常见符号)
|
|
|
|
|
|
func (tp *TextProcessor) RemoveNonASCII(text string) string {
|
|
|
|
|
|
var result strings.Builder
|
|
|
|
|
|
for _, r := range text {
|
|
|
|
|
|
// 保留ASCII字符(0-127)
|
|
|
|
|
|
if r < 128 {
|
|
|
|
|
|
result.WriteRune(r)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// 中文字符替换为空格或跳过
|
|
|
|
|
|
result.WriteRune(' ')
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return result.String()
|
|
|
|
|
|
}
|