Files
tyapi-server/internal/shared/pdf/text_processor.go
2026-03-16 13:10:42 +08:00

284 lines
9.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package pdf
import (
"html"
"regexp"
"strings"
)
// TextProcessor is a stateless receiver for the text clean-up, HTML and
// Markdown helpers defined in this file. It has no fields.
type TextProcessor struct{}
// NewTextProcessor returns a pointer to a freshly allocated TextProcessor.
// The type carries no state, so no further configuration is required.
func NewTextProcessor() *TextProcessor {
	return new(TextProcessor)
}
// CleanText decodes HTML entities in text and then filters it rune by rune,
// dropping anything that is not on the allow-list below.
//
// Kept unchanged:
//   - printable ASCII (U+0020–U+007E), which already covers letters and digits
//   - line feed, carriage return and tab
//   - CJK Unified Ideographs (U+4E00–U+9FFF), Extension A (U+3400–U+4DBF)
//     and Extension B (U+20000–U+2A6DF)
//   - CJK symbols and punctuation (U+3000–U+303F)
//   - full-width forms (U+FF00–U+FFEF)
//
// Runes in U+0080–U+009F (C1 control characters) are replaced by a single
// space. Every other rune is discarded.
func (tp *TextProcessor) CleanText(text string) string {
	decoded := html.UnescapeString(text)

	var sb strings.Builder
	for _, ch := range decoded {
		switch {
		case ch == '\n', ch == '\r', ch == '\t':
			// Line breaks and tabs.
			sb.WriteRune(ch)
		case ch >= 0x0020 && ch <= 0x007E:
			// Printable ASCII, including A-Z, a-z and 0-9.
			sb.WriteRune(ch)
		case ch >= 0x4E00 && ch <= 0x9FFF,
			ch >= 0x3400 && ch <= 0x4DBF,
			ch >= 0x20000 && ch <= 0x2A6DF:
			// CJK ideographs: base block, Extension A, Extension B.
			sb.WriteRune(ch)
		case ch >= 0x3000 && ch <= 0x303F,
			ch >= 0xFF00 && ch <= 0xFFEF:
			// CJK punctuation and full-width forms.
			sb.WriteRune(ch)
		case ch > 0x007F && ch < 0x00A0:
			// C1 control characters become a space so words do not merge.
			sb.WriteRune(' ')
		}
		// Anything else (treated as garbage) is skipped.
	}
	return sb.String()
}
// CleanTextPreservingMarkdown decodes HTML entities in text and strips runes
// that are not on the allow-list, while leaving Markdown syntax intact.
//
// Markdown markers such as * _ ` [ ] ( ) # - | : ! are all printable ASCII
// and are therefore kept by the U+0020–U+007E rule. Besides printable ASCII,
// the function keeps LF, CR and tab, CJK ideographs (base block, Extension A,
// Extension B), CJK symbols/punctuation and full-width forms. Runes in
// U+0080–U+009F are replaced by a space; all remaining runes are dropped.
func (tp *TextProcessor) CleanTextPreservingMarkdown(text string) string {
	// keep reports whether a rune is copied to the output unchanged.
	keep := func(c rune) bool {
		if c == '\n' || c == '\r' || c == '\t' {
			return true
		}
		if c >= 0x0020 && c <= 0x007E { // printable ASCII, incl. Markdown markers
			return true
		}
		if (c >= 0x4E00 && c <= 0x9FFF) || // CJK Unified Ideographs
			(c >= 0x3400 && c <= 0x4DBF) || // Extension A
			(c >= 0x20000 && c <= 0x2A6DF) { // Extension B
			return true
		}
		return (c >= 0x3000 && c <= 0x303F) || // CJK symbols and punctuation
			(c >= 0xFF00 && c <= 0xFFEF) // full-width forms
	}

	var cleaned strings.Builder
	for _, c := range html.UnescapeString(text) {
		if keep(c) {
			cleaned.WriteRune(c)
			continue
		}
		if c > 0x007F && c < 0x00A0 {
			// C1 control characters are turned into a space.
			cleaned.WriteRune(' ')
		}
		// Every other rune is silently discarded.
	}
	return cleaned.String()
}
// stripHTMLTagRe matches one HTML tag such as <p>, </div> or <br/>.
// It is compiled once at package initialisation rather than on every call.
var stripHTMLTagRe = regexp.MustCompile(`<[^>]+>`)

// StripHTML removes every HTML tag from text and returns what is left with
// leading and trailing whitespace trimmed.
//
// Tags are deleted without inserting line breaks, so adjacent block elements
// end up on the same line. HTML entities are decoded before the tags are
// removed; as a consequence escaped markup such as "&lt;b&gt;" is decoded
// into a tag and stripped too.
func (tp *TextProcessor) StripHTML(text string) string {
	text = html.UnescapeString(text)
	text = stripHTMLTagRe.ReplaceAllString(text, "")
	return strings.TrimSpace(text)
}
// Patterns used by HTMLToPlainWithBreaks. They are compiled once at package
// initialisation instead of on every call.
var (
	// plainBreakBlockCloseRe matches a closing block-level tag plus any
	// whitespace that follows it.
	plainBreakBlockCloseRe = regexp.MustCompile(`(?i)</(p|div|br|tr|li|h[1-6])>\s*`)
	// plainBreakBrRe matches <br>, <br/> and <br /> plus trailing whitespace.
	plainBreakBrRe = regexp.MustCompile(`(?i)<br\s*/?>\s*`)
	// plainBreakTagRe matches any remaining tag.
	plainBreakTagRe = regexp.MustCompile(`<[^>]+>`)
	// plainBreakBlankRe matches a run of spaces and tabs.
	plainBreakBlankRe = regexp.MustCompile(`[ \t]+`)
	// plainBreakNewlinesRe matches three or more consecutive line feeds.
	plainBreakNewlinesRe = regexp.MustCompile(`\n{3,}`)
)

// HTMLToPlainWithBreaks converts HTML to plain text while keeping the line
// structure of the rich text, so that paragraphs do not run together when the
// result is laid out in a PDF.
//
// Processing order:
//  1. HTML entities are decoded.
//  2. Closing </p>, </div>, </br>, </tr>, </li> and </h1>–</h6> tags, together
//     with the whitespace after them, become a single "\n".
//  3. <br>, <br/> and <br /> (with trailing whitespace) become "\n".
//  4. All remaining tags are removed.
//  5. Runs of spaces/tabs collapse to one space and runs of three or more
//     line feeds collapse to two (one blank line between paragraphs).
//
// The result is returned with leading and trailing whitespace trimmed.
func (tp *TextProcessor) HTMLToPlainWithBreaks(text string) string {
	text = html.UnescapeString(text)
	text = plainBreakBlockCloseRe.ReplaceAllString(text, "\n")
	text = plainBreakBrRe.ReplaceAllString(text, "\n")
	text = plainBreakTagRe.ReplaceAllString(text, "")
	text = plainBreakBlankRe.ReplaceAllString(text, " ")
	text = plainBreakNewlinesRe.ReplaceAllString(text, "\n\n")
	return strings.TrimSpace(text)
}
// HTMLSegment is one element of the sequence produced from an HTML fragment
// for drawing into a PDF. A segment either carries text (with its styling) or
// acts as a layout marker for a line break or a new paragraph.
type HTMLSegment struct {
	// Text is the plain text of the segment, with tags removed and HTML
	// entities decoded.
	Text string
	// Bold reports whether Text is rendered in bold.
	Bold bool
	// NewLine marks a line break (for example from <br>).
	NewLine bool
	// NewParagraph marks the start of a new paragraph (for example after
	// </p> or </div>).
	NewParagraph bool
	// HeadingLevel is 1–3 for h1–h3 and 0 for body text.
	HeadingLevel int
}
// segmentBlockBreakRe matches the tags that end a block in
// ParseHTMLToSegments (</p>, </div>, </h1>–</h6>, <br>, <br/>) together with
// any whitespace that follows. Compiled once at package initialisation.
var segmentBlockBreakRe = regexp.MustCompile(`(?i)(</p>|</div>|</h[1-6]>|<br\s*/?>)\s*`)

// ParseHTMLToSegments turns an HTML fragment into a flat sequence of
// HTMLSegment values for PDF drawing, keeping paragraphs, line breaks, bold
// runs and headings.
//
// The input is entity-decoded and cut into blocks at every </p>, </div>,
// </h1>–</h6> and <br>. For each block after the first, one marker segment is
// emitted before its text:
//   - NewParagraph when the previous block ended with </p>, </div> or </hN>,
//     or when the current block is a heading;
//   - otherwise NewLine when the previous block ended with <br>.
//
// The text of each block is split by parseInlineSegments into bold and
// non-bold runs; empty runs are skipped.
//
// HeadingLevel is taken from the tag that closes the block itself: 1–3 for
// </h1>–</h3>, 0 for everything else (including </h4>–</h6>). The marker that
// precedes a block carries the same level as that block's text. Previously
// the level was read from the tag that closed the preceding block, so heading
// text was emitted as body text and the block after it was tagged as heading.
//
// Limitation: a heading that contains a <br> is split at the <br>, and only
// the part closed by </hN> receives the heading level.
func (tp *TextProcessor) ParseHTMLToSegments(htmlStr string) []HTMLSegment {
	htmlStr = html.UnescapeString(htmlStr)

	// One scan yields both the block boundaries and the separating tags.
	breaks := segmentBlockBreakRe.FindAllStringIndex(htmlStr, -1)

	var out []HTMLSegment
	blockStart := 0
	prevTag := "" // normalised tag that closed the previous block
	for i := 0; i <= len(breaks); i++ {
		blockEnd := len(htmlStr)
		nextStart := blockEnd
		closeTag := "" // the final block has no closing tag
		if i < len(breaks) {
			blockEnd, nextStart = breaks[i][0], breaks[i][1]
			closeTag = strings.ToLower(strings.TrimSpace(htmlStr[blockEnd:nextStart]))
		}
		block := strings.TrimSpace(htmlStr[blockStart:blockEnd])

		// The heading level belongs to the block that the tag closes.
		headingLevel := 0
		switch {
		case strings.HasPrefix(closeTag, "</h1"):
			headingLevel = 1
		case strings.HasPrefix(closeTag, "</h2"):
			headingLevel = 2
		case strings.HasPrefix(closeTag, "</h3"):
			headingLevel = 3
		}

		// Emit the layout marker once, ahead of the block's text.
		if i > 0 {
			afterParagraph := strings.Contains(prevTag, "</p>") ||
				strings.Contains(prevTag, "</div>") ||
				strings.HasPrefix(prevTag, "</h")
			switch {
			case afterParagraph || headingLevel > 0:
				out = append(out, HTMLSegment{NewParagraph: true, HeadingLevel: headingLevel})
			case strings.Contains(prevTag, "<br"):
				out = append(out, HTMLSegment{NewLine: true})
			}
		}

		for _, seg := range tp.parseInlineSegments(block) {
			if seg.Text != "" {
				out = append(out, HTMLSegment{Text: seg.Text, Bold: seg.Bold, HeadingLevel: headingLevel})
			}
		}

		prevTag = closeTag
		blockStart = nextStart
	}
	return out
}
// inlineSeg is a run of text inside one block together with its bold state.
type inlineSeg struct {
	// Text is the plain text of the run.
	Text string
	// Bold reports whether the run is bold.
	Bold bool
}
// Patterns used by parseInlineSegments, compiled once at package
// initialisation.
var (
	// inlineTagRe matches one tag; a match always ends at the first '>' that
	// follows its '<'.
	inlineTagRe = regexp.MustCompile(`<[^>]+>`)
	// inlineBoldOpenRe / inlineBoldCloseRe recognise a tag match that ends in
	// <b>/<strong> or </b>/</strong>. Anchoring at the end also catches the
	// case where a stray '<' in the text makes the tag match start early.
	inlineBoldOpenRe  = regexp.MustCompile(`(?i)<(strong|b)>$`)
	inlineBoldCloseRe = regexp.MustCompile(`(?i)</(strong|b)>$`)
)

// parseInlineSegments strips the tags from block and splits the remaining
// text into consecutive runs that are either bold or not bold.
//
// Text normalisation: runs of spaces/tabs collapse to a single space and the
// overall text is trimmed of leading/trailing whitespace. A nil slice is
// returned when nothing is left.
//
// Bold handling: only attribute-less <b> and <strong> tags (any letter case)
// are recognised. An opening tag starts a bold run only if a closing </b> or
// </strong> appears later in the block; the run ends at the first such
// closing tag. Nested opening tags do not stack.
//
// The bold flag is tracked per rune while the text is being normalised, so
// whitespace collapsing and trimming cannot shift the bold boundaries. The
// earlier implementation computed offsets on untrimmed, separately collapsed
// substrings, which misplaced or dropped bold runs (e.g. "<p> a <b>b</b>").
func (tp *TextProcessor) parseInlineSegments(block string) []inlineSeg {
	tags := inlineTagRe.FindAllStringIndex(block, -1)

	// An opening bold tag is only honoured when a closing one follows it.
	lastClose := -1
	for i, loc := range tags {
		if inlineBoldCloseRe.MatchString(block[loc[0]:loc[1]]) {
			lastClose = i
		}
	}

	var (
		runes     []rune // normalised text
		bold      []bool // bold[i] is the bold state of runes[i]
		inBold    bool
		prevBlank bool // last appended rune was a collapsed space/tab
	)
	appendText := func(s string) {
		for _, r := range s {
			if r == ' ' || r == '\t' {
				if prevBlank {
					continue // collapse the run, even across tags
				}
				prevBlank = true
				r = ' '
			} else {
				prevBlank = false
			}
			runes = append(runes, r)
			bold = append(bold, inBold)
		}
	}

	pos := 0
	for i, loc := range tags {
		appendText(block[pos:loc[0]])
		tag := block[loc[0]:loc[1]]
		switch {
		case inlineBoldCloseRe.MatchString(tag):
			inBold = false
		case i < lastClose && inlineBoldOpenRe.MatchString(tag):
			inBold = true
		}
		pos = loc[1]
	}
	appendText(block[pos:])

	// Trim with the same whitespace definition as strings.TrimSpace, keeping
	// runes and bold flags aligned.
	isSpace := func(r rune) bool { return strings.TrimSpace(string(r)) == "" }
	lo, hi := 0, len(runes)
	for lo < hi && isSpace(runes[lo]) {
		lo++
	}
	for hi > lo && isSpace(runes[hi-1]) {
		hi--
	}
	if lo == hi {
		return nil
	}

	// Group consecutive runes that share the same bold state.
	var segs []inlineSeg
	start := lo
	for i := lo + 1; i <= hi; i++ {
		if i == hi || bold[i] != bold[start] {
			segs = append(segs, inlineSeg{Text: string(runes[start:i]), Bold: bold[start]})
			start = i
		}
	}
	return segs
}
// Patterns used by RemoveMarkdownSyntax, compiled once at package
// initialisation.
var (
	mdBoldStarRe         = regexp.MustCompile(`\*\*([^*]+)\*\*`)
	mdBoldUnderscoreRe   = regexp.MustCompile(`__([^_]+)__`)
	mdItalicStarRe       = regexp.MustCompile(`\*([^*]+)\*`)
	mdItalicUnderscoreRe = regexp.MustCompile(`_([^_]+)_`)
	mdInlineCodeRe       = regexp.MustCompile("`([^`]+)`")
	mdImageRe            = regexp.MustCompile(`!\[([^\]]*)\]\([^\)]+\)`)
	mdLinkRe             = regexp.MustCompile(`\[([^\]]+)\]\([^\)]+\)`)
	// (?m) lets ^ and $ match at every line; [ \t]+ (rather than \s+) keeps
	// the match from running across a line break.
	mdHeadingRe = regexp.MustCompile(`(?m)^#{1,6}[ \t]+(.+)$`)
)

// RemoveMarkdownSyntax removes common Markdown markers from text and keeps
// the text they wrap:
//
//	**bold** / __bold__   -> bold
//	*italic* / _italic_   -> italic
//	`code`                -> code
//	![alt](url)           -> alt
//	[text](url)           -> text
//	# heading (1–6 '#')   -> heading
//
// Images are handled before links. The link pattern also matches the
// "[alt](url)" part of an image, so running it first would leave a stray "!"
// in front of the alt text. Heading markers are stripped on every line of the
// input, not only when the whole input is a single heading line.
//
// NOTE(review): the underscore italic rule also matches pairs of underscores
// inside identifiers such as snake_case_name; this behaviour is unchanged.
func (tp *TextProcessor) RemoveMarkdownSyntax(text string) string {
	// Bold must run before italic so that "**x**" is not read as nested "*".
	text = mdBoldStarRe.ReplaceAllString(text, "$1")
	text = mdBoldUnderscoreRe.ReplaceAllString(text, "$1")
	text = mdItalicStarRe.ReplaceAllString(text, "$1")
	text = mdItalicUnderscoreRe.ReplaceAllString(text, "$1")
	text = mdInlineCodeRe.ReplaceAllString(text, "$1")
	// Images before links; see the function comment.
	text = mdImageRe.ReplaceAllString(text, "$1")
	text = mdLinkRe.ReplaceAllString(text, "$1")
	text = mdHeadingRe.ReplaceAllString(text, "$1")
	return text
}
// RemoveNonASCII returns text with every rune outside the ASCII range
// (code point 128 and above) replaced by a single space. ASCII runes,
// including control characters, are kept as they are.
//
// Despite the name nothing is deleted: each non-ASCII rune, whatever its
// encoded length, turns into exactly one space.
func (tp *TextProcessor) RemoveNonASCII(text string) string {
	toASCII := func(r rune) rune {
		if r >= 128 {
			// Non-ASCII (e.g. Chinese characters) becomes a space.
			return ' '
		}
		return r
	}
	return strings.Map(toASCII, text)
}