Files
tyapi-server/internal/shared/pdf/markdown_converter.go
2025-12-03 12:03:42 +08:00

659 lines
17 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package pdf
import (
	"encoding/json"
	"fmt"
	"regexp"
	"sort"
	"strings"
)
// MarkdownConverter normalizes markdown-like content of various shapes into a
// consistent, standard markdown form.
type MarkdownConverter struct {
// textProcessor provides the StripHTML and RemoveMarkdownSyntax helpers the
// converter calls while normalizing content.
textProcessor *TextProcessor
}
// NewMarkdownConverter builds a MarkdownConverter that uses textProcessor for
// HTML stripping and inline markdown removal.
func NewMarkdownConverter(textProcessor *TextProcessor) *MarkdownConverter {
	converter := new(MarkdownConverter)
	converter.textProcessor = textProcessor
	return converter
}
// ConvertToStandardMarkdown is the first (pre-processing) stage: it runs the
// content through a fixed sequence of normalization passes and returns the
// result. Content that is empty or whitespace-only is returned untouched.
func (mc *MarkdownConverter) ConvertToStandardMarkdown(content string) string {
	if strings.TrimSpace(content) == "" {
		return content
	}

	// HTML is stripped first so the markdown passes below only see text.
	content = mc.textProcessor.StripHTML(content)

	// The order matters: each pass consumes the output of the previous one.
	passes := []func(string) string{
		mc.normalizeCodeBlocks,      // make fences well-formed
		mc.normalizeTables,          // canonical table rows and separators
		mc.normalizeLists,           // uniform list markers
		mc.normalizeJSONContent,     // pretty-print JSON code blocks
		mc.convertLinksToText,       // links -> plain text
		mc.convertImagesToText,      // images -> placeholder text
		mc.normalizeBlockquotes,     // uniform "> " quotes
		mc.normalizeHorizontalRules, // uniform "---" rules
		mc.cleanupExtraBlankLines,   // collapse blank-line runs outside code
	}
	for _, pass := range passes {
		content = pass(content)
	}
	return content
}
// normalizeCodeBlocks rewrites every opening code fence as "```" immediately
// followed by its trimmed language tag (if any), leaves all other lines
// (including closing fences) exactly as they were, and appends a closing
// "```" when the content ends inside an unterminated block.
func (mc *MarkdownConverter) normalizeCodeBlocks(content string) string {
	src := strings.Split(content, "\n")
	out := make([]string, 0, len(src)+1)
	fenceOpen := false

	for _, raw := range src {
		marker := strings.TrimSpace(raw)
		switch {
		case !strings.HasPrefix(marker, "```"):
			// Ordinary line, inside or outside a block: keep verbatim.
			out = append(out, raw)
		case fenceOpen:
			// Closing fence: kept as written.
			out = append(out, raw)
			fenceOpen = false
		default:
			// Opening fence: emit it in canonical form.
			fenceOpen = true
			if lang := strings.TrimSpace(marker[3:]); lang != "" {
				out = append(out, fmt.Sprintf("```%s", lang))
			} else {
				out = append(out, "```")
			}
		}
	}

	// A block still open at the end of the content is closed automatically.
	if fenceOpen {
		out = append(out, "```")
	}
	return strings.Join(out, "\n")
}
// normalizeTables canonicalizes table lines outside fenced code blocks.
// Any line containing "|" is treated as a table line: separator lines are
// rebuilt as "| --- | ... |" with one entry per cell, and every other table
// line is passed through normalizeTableRow. All remaining lines are kept
// verbatim.
func (mc *MarkdownConverter) normalizeTables(content string) string {
	src := strings.Split(content, "\n")
	out := make([]string, 0, len(src))
	fenced := false

	for _, raw := range src {
		row := strings.TrimSpace(raw)
		switch {
		case strings.HasPrefix(row, "```"):
			// Fence lines toggle the code-block state and are never rewritten.
			fenced = !fenced
			out = append(out, raw)
		case fenced, !strings.Contains(row, "|"):
			out = append(out, raw)
		case mc.isTableSeparator(row):
			columns := strings.Split(row, "|")
			// Leading/trailing pipes produce empty edge cells; drop them.
			if len(columns) > 0 && columns[0] == "" {
				columns = columns[1:]
			}
			if n := len(columns); n > 0 && columns[n-1] == "" {
				columns = columns[:n-1]
			}
			out = append(out, "|"+strings.Repeat(" --- |", len(columns)))
		default:
			out = append(out, mc.normalizeTableRow(raw))
		}
	}
	return strings.Join(out, "\n")
}
// isTableSeparator reports whether line, after trimming surrounding
// whitespace, contains at least one '-' and consists solely of the characters
// '|', '-', ':' and the ASCII space.
func (mc *MarkdownConverter) isTableSeparator(line string) bool {
	candidate := strings.TrimSpace(line)
	if !strings.Contains(candidate, "-") {
		return false
	}
	// Trimming the allowed characters from both ends leaves nothing exactly
	// when every character belongs to the allowed set.
	return strings.Trim(candidate, "|-: ") == ""
}
// normalizeTableRow rebuilds a table row as "| cell | cell |". Each cell is
// whitespace-trimmed and passed through the text processor's
// RemoveMarkdownSyntax. A line without any "|" is returned unchanged.
func (mc *MarkdownConverter) normalizeTableRow(line string) string {
	row := strings.TrimSpace(line)
	if !strings.Contains(row, "|") {
		return line
	}

	parts := strings.Split(row, "|")
	// Leading/trailing pipes yield empty edge entries that are not real cells.
	if len(parts) > 0 && parts[0] == "" {
		parts = parts[1:]
	}
	if n := len(parts); n > 0 && parts[n-1] == "" {
		parts = parts[:n-1]
	}

	var b strings.Builder
	b.WriteString("| ")
	for i, part := range parts {
		if i > 0 {
			b.WriteString(" | ")
		}
		b.WriteString(mc.textProcessor.RemoveMarkdownSyntax(strings.TrimSpace(part)))
	}
	b.WriteString(" |")
	return b.String()
}
// List-marker patterns used by normalizeLists. They are compiled once at
// package initialization instead of once per processed line.
var (
	// orderedListMarkerRe matches "<digits>." followed by whitespace.
	orderedListMarkerRe = regexp.MustCompile(`^(\d+)\.\s+`)
	// unorderedListMarkerRe matches "-", "*" or "+" followed by a space and
	// any further whitespace.
	unorderedListMarkerRe = regexp.MustCompile(`^[-*+] \s*`)
)

// normalizeLists gives list items outside fenced code blocks a uniform shape.
// Ordered items become "<n>. text" and unordered items ("-", "*" or "+"
// followed by a space) become "- text". Rewritten items are emitted without
// their original leading indentation; every other line is kept verbatim.
func (mc *MarkdownConverter) normalizeLists(content string) string {
	lines := strings.Split(content, "\n")
	result := make([]string, 0, len(lines))
	inCodeBlock := false

	for _, line := range lines {
		trimmed := strings.TrimSpace(line)

		// Fence lines toggle the code-block state and are never rewritten.
		if strings.HasPrefix(trimmed, "```") {
			inCodeBlock = !inCodeBlock
			result = append(result, line)
			continue
		}
		if inCodeBlock {
			result = append(result, line)
			continue
		}

		if m := orderedListMarkerRe.FindStringSubmatch(trimmed); m != nil {
			// m[0] is the whole marker including whitespace, m[1] the number.
			result = append(result, m[1]+". "+trimmed[len(m[0]):])
		} else if marker := unorderedListMarkerRe.FindString(trimmed); marker != "" {
			result = append(result, "- "+trimmed[len(marker):])
		} else {
			result = append(result, line)
		}
	}
	return strings.Join(result, "\n")
}
// normalizeJSONContent pretty-prints fenced code blocks that hold valid JSON.
//
// Fences are paired line by line: a line whose trimmed text starts with "```"
// opens a block, and the next such line closes it. Only blocks with no
// language tag or with the tag "json" are candidates. When the block body
// parses as JSON it is replaced by an indented rendering and the opening fence
// becomes "```json"; otherwise the block is left untouched. An unterminated
// block is copied through unchanged.
func (mc *MarkdownConverter) normalizeJSONContent(content string) string {
	lines := strings.Split(content, "\n")
	result := make([]string, 0, len(lines))

	for i := 0; i < len(lines); i++ {
		opener := strings.TrimSpace(lines[i])
		if !strings.HasPrefix(opener, "```") {
			result = append(result, lines[i])
			continue
		}

		// Locate the fence that closes this block.
		closeIdx := -1
		for j := i + 1; j < len(lines); j++ {
			if strings.HasPrefix(strings.TrimSpace(lines[j]), "```") {
				closeIdx = j
				break
			}
		}
		if closeIdx < 0 {
			// No closing fence: nothing can be paired, keep the rest as is.
			result = append(result, lines[i:]...)
			break
		}

		if lang := strings.TrimSpace(opener[3:]); lang == "" || lang == "json" {
			body := strings.Join(lines[i+1:closeIdx], "\n")
			if formatted, ok := reindentJSONBlock(body); ok {
				// Keep whatever indentation preceded the opening fence.
				indent := lines[i][:strings.Index(lines[i], "```")]
				result = append(result, indent+"```json", formatted, lines[closeIdx])
				i = closeIdx
				continue
			}
		}

		// Not a JSON block: copy it, fences included, without changes.
		result = append(result, lines[i:closeIdx+1]...)
		i = closeIdx
	}
	return strings.Join(result, "\n")
}

// reindentJSONBlock parses raw as JSON and returns it re-encoded with
// one-space indentation. HTML escaping is disabled so characters such as '&',
// '<' and '>' stay readable. ok is false when raw is not valid JSON.
func reindentJSONBlock(raw string) (formatted string, ok bool) {
	var value interface{}
	if err := json.Unmarshal([]byte(strings.TrimSpace(raw)), &value); err != nil {
		return "", false
	}

	var buf strings.Builder
	enc := json.NewEncoder(&buf)
	enc.SetEscapeHTML(false)
	enc.SetIndent("", " ")
	if err := enc.Encode(value); err != nil {
		return "", false
	}
	// Encode terminates its output with a newline that is not part of the value.
	return strings.TrimSuffix(buf.String(), "\n"), true
}
// Link patterns used by convertLinksToText, compiled once at package
// initialization.
var (
	// inlineLinkRe matches "[text](url)" with an optional leading "!" so that
	// image syntax can be recognized and skipped.
	inlineLinkRe = regexp.MustCompile(`!?\[([^\]]+)\]\(([^\)]+)\)`)
	// referenceLinkRe matches "[text][ref]".
	referenceLinkRe = regexp.MustCompile(`\[([^\]]+)\]\[[^\]]+\]`)
)

// convertLinksToText replaces markdown links with plain text:
//
//	[text](url)  -> text (url)
//	[text][ref]  -> text
//
// Inline images ("![alt](url)") are deliberately left untouched so that
// convertImagesToText can still recognize them; rewriting them here would
// turn them into "!alt (url)".
func (mc *MarkdownConverter) convertLinksToText(content string) string {
	content = inlineLinkRe.ReplaceAllStringFunc(content, func(match string) string {
		if strings.HasPrefix(match, "!") {
			// Image syntax, not a link.
			return match
		}
		return inlineLinkRe.ReplaceAllString(match, "$1 ($2)")
	})
	return referenceLinkRe.ReplaceAllString(content, "$1")
}
// convertImagesToText replaces every inline image "![alt](url)" with a
// bracketed text placeholder that carries the alt text; the URL is dropped.
func (mc *MarkdownConverter) convertImagesToText(content string) string {
	const placeholder = "[图片: $1]"
	pattern := regexp.MustCompile(`!\[([^\]]*)\]\([^\)]+\)`)
	return pattern.ReplaceAllString(content, placeholder)
}
// normalizeBlockquotes rewrites quote lines outside fenced code blocks so
// that they read "> text" (or a bare ">" when the quote is empty). The
// original indentation of a quote line is discarded; all other lines are kept
// verbatim.
func (mc *MarkdownConverter) normalizeBlockquotes(content string) string {
	src := strings.Split(content, "\n")
	out := make([]string, 0, len(src))
	fenced := false

	for _, raw := range src {
		stripped := strings.TrimSpace(raw)
		switch {
		case strings.HasPrefix(stripped, "```"):
			fenced = !fenced
			out = append(out, raw)
		case fenced || !strings.HasPrefix(stripped, ">"):
			out = append(out, raw)
		default:
			// Only the first ">" is consumed, so ">> x" becomes "> > x".
			if body := strings.TrimSpace(stripped[1:]); body != "" {
				out = append(out, "> "+body)
			} else {
				out = append(out, ">")
			}
		}
	}
	return strings.Join(out, "\n")
}
// normalizeHorizontalRules replaces every horizontal-rule line outside fenced
// code blocks with "---". A line counts as a rule when its trimmed text is
// three or more characters drawn from '-', '*' and '_'.
func (mc *MarkdownConverter) normalizeHorizontalRules(content string) string {
	rule := regexp.MustCompile(`^[-*_]{3,}\s*$`)

	src := strings.Split(content, "\n")
	out := make([]string, 0, len(src))
	fenced := false

	for _, raw := range src {
		text := strings.TrimSpace(raw)
		switch {
		case strings.HasPrefix(text, "```"):
			fenced = !fenced
			out = append(out, raw)
		case !fenced && rule.MatchString(text):
			out = append(out, "---")
		default:
			out = append(out, raw)
		}
	}
	return strings.Join(out, "\n")
}
// cleanupExtraBlankLines collapses each run of blank (whitespace-only) lines
// outside fenced code blocks into a single empty line. Lines inside code
// blocks, blank or not, are preserved exactly.
func (mc *MarkdownConverter) cleanupExtraBlankLines(content string) string {
	src := strings.Split(content, "\n")
	kept := make([]string, 0, len(src))
	fenced, prevBlank := false, false

	for _, raw := range src {
		text := strings.TrimSpace(raw)
		isFence := strings.HasPrefix(text, "```")
		if isFence {
			fenced = !fenced
		}

		switch {
		case isFence:
			prevBlank = false
		case fenced:
			prevBlank = text == ""
		case text == "":
			if prevBlank {
				// Second or later blank line of a run: drop it.
				continue
			}
			// First blank line of a run is kept, stripped of any whitespace.
			raw, prevBlank = "", true
		default:
			prevBlank = false
		}
		kept = append(kept, raw)
	}
	return strings.Join(kept, "\n")
}
// PreprocessContent is the main conversion entry point, meant to run before
// the content is parsed. It standardizes the markdown, then tries to turn a
// JSON array (or a JSON object with "params"/"fields") into a table, and
// finally makes sure tables carry a separator row. Empty or whitespace-only
// content is returned as is.
func (mc *MarkdownConverter) PreprocessContent(content string) string {
	if strings.TrimSpace(content) == "" {
		return content
	}

	standardized := mc.ConvertToStandardMarkdown(content)
	tabulated := mc.convertJSONArrayToTable(standardized)
	return mc.ensureTableSeparators(tabulated)
}
// convertJSONArrayToTable turns content that is, as a whole, a JSON array of
// objects into a markdown table. Content that is a JSON object is converted
// when its "params" entry, or failing that its "fields" entry, is an array
// containing at least one object. Content that already has a line with "|"
// (other than a line starting with a code fence) is assumed to contain a
// table and is returned unchanged, as is anything that does not parse.
func (mc *MarkdownConverter) convertJSONArrayToTable(content string) string {
	for _, raw := range strings.Split(content, "\n") {
		text := strings.TrimSpace(raw)
		if strings.Contains(text, "|") && !strings.HasPrefix(text, "```") {
			return content
		}
	}

	payload := strings.TrimSpace(content)
	switch {
	case strings.HasPrefix(payload, "["):
		var rows []map[string]interface{}
		if json.Unmarshal([]byte(payload), &rows) == nil && len(rows) > 0 {
			return mc.jsonArrayToMarkdownTable(rows)
		}

	case strings.HasPrefix(payload, "{"):
		var doc map[string]interface{}
		if json.Unmarshal([]byte(payload), &doc) != nil {
			break
		}
		// "params" takes precedence; "fields" is the fallback.
		for _, key := range [...]string{"params", "fields"} {
			items, isList := doc[key].([]interface{})
			if !isList {
				continue
			}
			rows := make([]map[string]interface{}, 0, len(items))
			for _, item := range items {
				// Entries that are not JSON objects are skipped.
				if row, isObject := item.(map[string]interface{}); isObject {
					rows = append(rows, row)
				}
			}
			if len(rows) > 0 {
				return mc.jsonArrayToMarkdownTable(rows)
			}
		}
	}
	return content
}
// jsonArrayToMarkdownTable renders data as a markdown table: a header row
// with the column names, a "| --- |" separator row, and one row per element
// of data. Every line, including the last, ends with "\n". It returns "" when
// data is empty or none of its elements has any key.
//
// Column order is deterministic: rows are visited in order and the keys a row
// introduces are appended in sorted order. The original key order of the JSON
// text is not available here because the rows arrive as Go maps, whose
// iteration order is random.
func (mc *MarkdownConverter) jsonArrayToMarkdownTable(data []map[string]interface{}) string {
	if len(data) == 0 {
		return ""
	}

	seen := make(map[string]bool)
	var columns []string
	for _, row := range data {
		fresh := make([]string, 0, len(row))
		for key := range row {
			if !seen[key] {
				seen[key] = true
				fresh = append(fresh, key)
			}
		}
		// Sorting removes the dependence on map iteration order.
		sort.Strings(fresh)
		columns = append(columns, fresh...)
	}
	if len(columns) == 0 {
		return ""
	}

	var result strings.Builder

	// Header row: column names are used exactly as they appear in the data.
	result.WriteString("|")
	for _, col := range columns {
		result.WriteString(" ")
		result.WriteString(col)
		result.WriteString(" |")
	}
	result.WriteString("\n")

	// Separator row.
	result.WriteString("|")
	result.WriteString(strings.Repeat(" --- |", len(columns)))
	result.WriteString("\n")

	// Data rows: a key missing from a row yields an empty cell.
	for _, row := range data {
		result.WriteString("|")
		for _, col := range columns {
			result.WriteString(" ")
			result.WriteString(mc.formatCellValue(row[col]))
			result.WriteString(" |")
		}
		result.WriteString("\n")
	}
	return result.String()
}
// formatColumnName returns name unchanged. No mapping or transformation is
// applied; per the original author's note this is deliberate, so that column
// names stay exactly as they are in the source data.
func (mc *MarkdownConverter) formatColumnName(name string) string {
// Identity: the raw column name is the display name.
return name
}
// formatCellValue renders a decoded value as the text of one table cell.
//
//   - nil yields "".
//   - Strings have line breaks replaced by spaces, are trimmed, and have "|"
//     escaped as "\|".
//   - Booleans are rendered as the Chinese words for yes/no.
//   - float64 values that are whole numbers print without a fractional part;
//     other floats use %g.
//   - Integer types print in decimal.
//   - Anything else is formatted with %v and then cleaned like a string.
func (mc *MarkdownConverter) formatCellValue(value interface{}) string {
	if value == nil {
		return ""
	}

	// flatten makes text safe for a single table cell.
	flatten := func(text string) string {
		text = strings.NewReplacer("\n", " ", "\r", " ").Replace(text)
		return strings.ReplaceAll(strings.TrimSpace(text), "|", "\\|")
	}

	switch v := value.(type) {
	case string:
		return flatten(v)
	case bool:
		if !v {
			return "否"
		}
		return "是"
	case float64:
		if v != float64(int64(v)) {
			return fmt.Sprintf("%g", v)
		}
		return fmt.Sprintf("%.0f", v)
	case int, int8, int16, int32, int64,
		uint, uint8, uint16, uint32, uint64:
		return fmt.Sprintf("%d", v)
	default:
		return flatten(fmt.Sprintf("%v", v))
	}
}
// ensureTableSeparators makes sure each table outside fenced code blocks has
// a separator row right after its header.
//
// A table is a run of consecutive lines containing "|". Its first line is the
// header unless that line is itself a separator. If the line after the header
// is an ordinary row, a "| --- | ... |" separator with one entry per header
// cell is inserted between them. At most one separator is inserted per table,
// and tables that already have one are left unchanged.
func (mc *MarkdownConverter) ensureTableSeparators(content string) string {
	lines := strings.Split(content, "\n")
	result := make([]string, 0, len(lines)+1)

	inCodeBlock := false
	inTable := false    // the previous line belonged to a table
	pendingHeader := "" // header row still waiting for its separator, if any

	for _, line := range lines {
		trimmed := strings.TrimSpace(line)
		switch {
		case strings.HasPrefix(trimmed, "```"):
			inCodeBlock = !inCodeBlock
			inTable, pendingHeader = false, ""
			result = append(result, line)

		case inCodeBlock || !strings.Contains(trimmed, "|"):
			// Any non-table line ends the current table.
			inTable, pendingHeader = false, ""
			result = append(result, line)

		case mc.isTableSeparator(trimmed):
			// The table already has its separator; nothing to insert.
			inTable, pendingHeader = true, ""
			result = append(result, line)

		case pendingHeader != "":
			// A data row directly follows the header: add the missing separator.
			result = append(result, markdownSeparatorForRow(pendingHeader), line)
			pendingHeader = ""

		case !inTable:
			// First row of a new table is its header.
			inTable, pendingHeader = true, trimmed
			result = append(result, line)

		default:
			result = append(result, line)
		}
	}
	return strings.Join(result, "\n")
}

// markdownSeparatorForRow builds a "| --- | ... |" separator with one entry
// per cell of row. Empty edge cells produced by leading/trailing pipes are not
// counted.
func markdownSeparatorForRow(row string) string {
	cells := strings.Split(strings.TrimSpace(row), "|")
	if len(cells) > 0 && cells[0] == "" {
		cells = cells[1:]
	}
	if n := len(cells); n > 0 && cells[n-1] == "" {
		cells = cells[:n-1]
	}
	return "|" + strings.Repeat(" --- |", len(cells))
}