f
This commit is contained in:
@@ -106,6 +106,130 @@ func (tp *TextProcessor) HTMLToPlainWithBreaks(text string) string {
|
||||
return strings.TrimSpace(text)
|
||||
}
|
||||
|
||||
// HTMLSegment is one piece of parsed HTML prepared for PDF drawing. A segment
// either carries text (Text, Bold, HeadingLevel) or acts as a layout marker
// (NewLine / NewParagraph) emitted between blocks of text.
type HTMLSegment struct {
	Text         string // plain text (tags removed, entities decoded)
	Bold         bool   // whether the text is bold
	NewLine      bool   // line break (e.g. <br>)
	NewParagraph bool   // new paragraph (e.g. </p>, </div>)
	HeadingLevel int    // 1-3 for h1-h3, 0 for body text
}
|
||||
|
||||
// ParseHTMLToSegments 将 HTML 解析为用于 PDF 绘制的片段序列,保留段落、换行、加粗与标题
|
||||
func (tp *TextProcessor) ParseHTMLToSegments(htmlStr string) []HTMLSegment {
|
||||
htmlStr = html.UnescapeString(htmlStr)
|
||||
var out []HTMLSegment
|
||||
blockSplit := regexp.MustCompile(`(?i)(</p>|</div>|</h[1-6]>|<br\s*/?>)\s*`)
|
||||
parts := blockSplit.Split(htmlStr, -1)
|
||||
tags := blockSplit.FindAllString(htmlStr, -1)
|
||||
for i, block := range parts {
|
||||
block = strings.TrimSpace(block)
|
||||
var prevTag string
|
||||
if i > 0 && i-1 < len(tags) {
|
||||
prevTag = strings.ToLower(strings.TrimSpace(tags[i-1]))
|
||||
}
|
||||
isNewParagraph := strings.Contains(prevTag, "</p>") || strings.Contains(prevTag, "</div>") ||
|
||||
strings.HasPrefix(prevTag, "</h")
|
||||
isNewLine := strings.Contains(prevTag, "<br")
|
||||
headingLevel := 0
|
||||
if strings.HasPrefix(prevTag, "</h1") {
|
||||
headingLevel = 1
|
||||
} else if strings.HasPrefix(prevTag, "</h2") {
|
||||
headingLevel = 2
|
||||
} else if strings.HasPrefix(prevTag, "</h3") {
|
||||
headingLevel = 3
|
||||
}
|
||||
segments := tp.parseInlineSegments(block)
|
||||
// 块前先输出段落/换行/标题标记(仅在第一段文本前输出一次)
|
||||
if i > 0 {
|
||||
if isNewParagraph || headingLevel > 0 {
|
||||
out = append(out, HTMLSegment{NewParagraph: true, HeadingLevel: headingLevel})
|
||||
} else if isNewLine {
|
||||
out = append(out, HTMLSegment{NewLine: true})
|
||||
}
|
||||
}
|
||||
for _, seg := range segments {
|
||||
if seg.Text != "" {
|
||||
out = append(out, HTMLSegment{Text: seg.Text, Bold: seg.Bold, HeadingLevel: headingLevel})
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// inlineSeg is an inline run of text within a block: the text itself plus
// whether it is bold.
type inlineSeg struct {
	Text string
	Bold bool
}
|
||||
|
||||
// parseInlineSegments 解析块内文本,按 <strong>/<b> 拆成片段
|
||||
func (tp *TextProcessor) parseInlineSegments(block string) []inlineSeg {
|
||||
var segs []inlineSeg
|
||||
// 移除所有标签并收集加粗区间(按字符偏移)
|
||||
reBoldOpen := regexp.MustCompile(`(?i)<(strong|b)>`)
|
||||
reBoldClose := regexp.MustCompile(`(?i)</(strong|b)>`)
|
||||
plain := regexp.MustCompile(`<[^>]+>`).ReplaceAllString(block, "")
|
||||
plain = regexp.MustCompile(`[ \t]+`).ReplaceAllString(plain, " ")
|
||||
plain = strings.TrimSpace(plain)
|
||||
if plain == "" {
|
||||
return segs
|
||||
}
|
||||
// 在原始 block 上找加粗区间,再映射到 plain(去掉标签后的位置)
|
||||
work := block
|
||||
var boldRanges [][2]int
|
||||
offset := 0
|
||||
for {
|
||||
idxOpen := reBoldOpen.FindStringIndex(work)
|
||||
if idxOpen == nil {
|
||||
break
|
||||
}
|
||||
afterOpen := work[idxOpen[1]:]
|
||||
idxClose := reBoldClose.FindStringIndex(afterOpen)
|
||||
if idxClose == nil {
|
||||
break
|
||||
}
|
||||
startInWork := offset + idxOpen[1]
|
||||
endInWork := offset + idxOpen[1] + idxClose[0]
|
||||
// 将 work 坐标映射到 plain:需要数 plain 中对应字符
|
||||
workBefore := work[:startInWork]
|
||||
plainBefore := regexp.MustCompile(`<[^>]+>`).ReplaceAllString(workBefore, "")
|
||||
plainBefore = regexp.MustCompile(`[ \t]+`).ReplaceAllString(plainBefore, " ")
|
||||
startPlain := len([]rune(plainBefore))
|
||||
workUntil := work[:endInWork]
|
||||
plainUntil := regexp.MustCompile(`<[^>]+>`).ReplaceAllString(workUntil, "")
|
||||
plainUntil = regexp.MustCompile(`[ \t]+`).ReplaceAllString(plainUntil, " ")
|
||||
endPlain := len([]rune(plainUntil))
|
||||
boldRanges = append(boldRanges, [2]int{startPlain, endPlain})
|
||||
work = work[endInWork+len(reBoldClose.FindString(afterOpen)):]
|
||||
offset = endInWork + len(reBoldClose.FindString(afterOpen))
|
||||
}
|
||||
// 按 boldRanges 切分 plain
|
||||
runes := []rune(plain)
|
||||
inBold := false
|
||||
var start int
|
||||
for i := 0; i <= len(runes); i++ {
|
||||
nowBold := false
|
||||
for _, r := range boldRanges {
|
||||
if i >= r[0] && i < r[1] {
|
||||
nowBold = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if nowBold != inBold || i == len(runes) {
|
||||
if i > start {
|
||||
segs = append(segs, inlineSeg{Text: string(runes[start:i]), Bold: inBold})
|
||||
}
|
||||
start = i
|
||||
inBold = nowBold
|
||||
}
|
||||
}
|
||||
if len(segs) == 0 && plain != "" {
|
||||
segs = append(segs, inlineSeg{Text: plain, Bold: false})
|
||||
}
|
||||
return segs
|
||||
}
|
||||
|
||||
// RemoveMarkdownSyntax 移除markdown语法,保留纯文本
|
||||
func (tp *TextProcessor) RemoveMarkdownSyntax(text string) string {
|
||||
// 移除粗体标记 **text** 或 __text__
|
||||
|
||||
Reference in New Issue
Block a user