f

2026-03-16 12:32:41 +08:00
parent 09db8d003e
commit 14b2c53eeb
5 changed files with 394 additions and 75 deletions
--- a/internal/shared/pdf/text_processor.go
+++ b/internal/shared/pdf/text_processor.go
@@ -106,6 +106,130 @@ func (tp *TextProcessor) HTMLToPlainWithBreaks(text string) string {
 	return strings.TrimSpace(text)
 }

+// HTMLSegment is one piece of parsed HTML prepared for PDF drawing. The
+// segment sequence preserves paragraphs, line breaks, bold runs and headings.
+// A segment is either a text run (Text is non-empty) or a layout marker
+// (NewLine or NewParagraph is set and Text is empty).
+type HTMLSegment struct {
+	Text         string // plain text with tags stripped and HTML entities decoded
+	Bold         bool   // whether the text is bold
+	NewLine      bool   // line-break marker (e.g. produced by <br>)
+	NewParagraph bool   // new-paragraph marker (e.g. produced by </p> or </div>)
+	HeadingLevel int    // 1-3 for h1-h3, 0 for body text
+}
+
+// htmlBlockBoundaryRe matches the tags that terminate a block (</p>, </div>,
+// </h1>..</h6>, <br>) together with any whitespace that follows them.
+var htmlBlockBoundaryRe = regexp.MustCompile(`(?i)(</p>|</div>|</h[1-6]>|<br\s*/?>)\s*`)
+
+// htmlHeadingOpenRe matches an opening <h1>..<h6> tag (attributes allowed)
+// and captures the heading digit.
+var htmlHeadingOpenRe = regexp.MustCompile(`(?i)<h([1-6])(?:\s[^>]*)?>`)
+
+// ParseHTMLToSegments converts an HTML fragment into the ordered list of
+// segments used for PDF drawing, preserving paragraphs, line breaks, bold
+// runs and headings.
+//
+// The input is split at block boundaries (</p>, </div>, </hN>, <br>). For
+// every block after the first, a marker segment is emitted first
+// (NewParagraph after a closing block tag, NewLine after <br>), followed by
+// the block's non-empty text runs. A block's HeadingLevel is taken from the
+// heading the block itself belongs to: its opening <hN> tag and/or the </hN>
+// that terminates it. Only h1-h3 map to levels 1-3; h4-h6 are treated as body
+// text (level 0) but still start a new paragraph. A NewParagraph marker
+// carries the heading level of the block that follows it.
+//
+// It returns nil when the input yields no segments.
+func (tp *TextProcessor) ParseHTMLToSegments(htmlStr string) []HTMLSegment {
+	// NOTE(review): entities are decoded before tags are recognised, so
+	// escaped markup such as "&lt;b&gt;" is subsequently treated as a real
+	// tag. Kept as-is because decoding later would change how "&nbsp;"-only
+	// blocks are trimmed; confirm whether that trade-off is intended.
+	htmlStr = html.UnescapeString(htmlStr)
+	parts := htmlBlockBoundaryRe.Split(htmlStr, -1)
+	tags := htmlBlockBoundaryRe.FindAllString(htmlStr, -1)
+
+	// levelOf maps the digit of an <hN> tag to a heading level; only h1-h3
+	// are rendered as headings.
+	levelOf := func(digit byte) int {
+		if digit >= '1' && digit <= '3' {
+			return int(digit - '0')
+		}
+		return 0
+	}
+	// boundaryAt returns the normalised boundary tag with index i, or "" when
+	// there is none (before the first block / after the last one).
+	boundaryAt := func(i int) string {
+		if i < 0 || i >= len(tags) {
+			return ""
+		}
+		return strings.ToLower(strings.TrimSpace(tags[i]))
+	}
+
+	var out []HTMLSegment
+	carried := 0 // level of a heading that is still open across a <br>
+	for i, block := range parts {
+		block = strings.TrimSpace(block)
+		prevTag, nextTag := boundaryAt(i-1), boundaryAt(i)
+
+		// Determine the heading level of THIS block. Deriving it from the
+		// preceding closing tag would style the text after a heading instead
+		// of the heading itself.
+		level := carried
+		if m := htmlHeadingOpenRe.FindStringSubmatch(block); m != nil {
+			level = levelOf(m[1][0])
+		}
+		if strings.HasPrefix(nextTag, "</h") {
+			level = levelOf(nextTag[3]) // "</hN>": index 3 is the digit
+		}
+		if strings.HasPrefix(nextTag, "<br") {
+			carried = level // <br> inside a heading: the next line is still heading text
+		} else {
+			carried = 0
+		}
+
+		// Emit the paragraph / line-break marker once, before the block's text.
+		switch {
+		case i == 0:
+			// The first block has no preceding boundary.
+		case strings.HasPrefix(prevTag, "</"): // </p>, </div>, </hN>
+			out = append(out, HTMLSegment{NewParagraph: true, HeadingLevel: level})
+		case strings.HasPrefix(prevTag, "<br"):
+			out = append(out, HTMLSegment{NewLine: true})
+		}
+
+		for _, seg := range tp.parseInlineSegments(block) {
+			if seg.Text != "" {
+				out = append(out, HTMLSegment{Text: seg.Text, Bold: seg.Bold, HeadingLevel: level})
+			}
+		}
+	}
+	return out
+}
+
+// inlineSeg is an inline run within a single block: a piece of text plus
+// whether that text is bold.
+type inlineSeg struct {
+	Text string // text of the run, with tags removed
+	Bold bool   // whether the run lies inside <strong>/<b>
+}
+
+// inlineTagRe matches any single HTML tag; inlineBoldOpenRe and
+// inlineBoldCloseRe classify one such tag as an opening or closing
+// <strong>/<b> (attributes on the opening tag are allowed).
+var (
+	inlineTagRe       = regexp.MustCompile(`<[^>]+>`)
+	inlineBoldOpenRe  = regexp.MustCompile(`(?i)^<(strong|b)(\s[^>]*)?>$`)
+	inlineBoldCloseRe = regexp.MustCompile(`(?i)^</(strong|b)\s*>$`)
+)
+
+// parseInlineSegments strips all tags from block and splits the remaining
+// text into consecutive runs of bold / non-bold text, based on <strong> and
+// <b> tags.
+//
+// Runs of spaces and tabs are collapsed to a single space (also across tag
+// boundaries; the surviving space keeps the style of the first whitespace
+// character of the run), and leading/trailing whitespace of the whole block
+// is trimmed. Nested bold tags are supported, an unmatched closing tag is
+// ignored, and an unclosed opening tag keeps the text bold until the end of
+// the block. It returns nil when the block contains no visible text.
+func (tp *TextProcessor) parseInlineSegments(block string) []inlineSeg {
+	var (
+		runes     []rune // tag-free text with spaces/tabs collapsed
+		bold      []bool // bold[i] reports whether runes[i] is inside <strong>/<b>
+		depth     int    // current nesting depth of bold tags
+		prevBlank bool   // whether the previously kept rune was a collapsed space
+	)
+	appendText := func(text string) {
+		for _, r := range text {
+			if r == ' ' || r == '\t' {
+				if prevBlank {
+					continue
+				}
+				prevBlank = true
+				r = ' '
+			} else {
+				prevBlank = false
+			}
+			runes = append(runes, r)
+			bold = append(bold, depth > 0)
+		}
+	}
+
+	// Single pass over the block: text between tags is recorded with the
+	// current bold state, so no offset mapping between the raw and the
+	// stripped text is needed.
+	pos := 0
+	for _, loc := range inlineTagRe.FindAllStringIndex(block, -1) {
+		appendText(block[pos:loc[0]])
+		tag := block[loc[0]:loc[1]]
+		switch {
+		case inlineBoldOpenRe.MatchString(tag):
+			depth++
+		case inlineBoldCloseRe.MatchString(tag):
+			if depth > 0 {
+				depth--
+			}
+		}
+		pos = loc[1]
+	}
+	appendText(block[pos:])
+
+	plain := string(runes)
+	trimmed := strings.TrimSpace(plain)
+	if trimmed == "" {
+		return nil
+	}
+	// Drop the trimmed leading/trailing whitespace from both parallel slices.
+	// trimmed starts with a non-space rune and everything before it in plain
+	// is whitespace, so the first occurrence found by Index is the real start.
+	lead := len([]rune(plain[:strings.Index(plain, trimmed)]))
+	end := lead + len([]rune(trimmed))
+	runes, bold = runes[lead:end], bold[lead:end]
+
+	// Group consecutive runes that share the same bold state.
+	var segs []inlineSeg
+	start := 0
+	for i := 1; i <= len(runes); i++ {
+		if i == len(runes) || bold[i] != bold[start] {
+			segs = append(segs, inlineSeg{Text: string(runes[start:i]), Bold: bold[start]})
+			start = i
+		}
+	}
+	return segs
+}
+
 // RemoveMarkdownSyntax 移除markdown语法，保留纯文本
 func (tp *TextProcessor) RemoveMarkdownSyntax(text string) string {
 	// 移除粗体标记 **text** 或 __text__