659 lines
17 KiB
Go
659 lines
17 KiB
Go
package pdf
|
||
|
||
import (
	"encoding/json"
	"fmt"
	"regexp"
	"sort"
	"strings"
)
|
||
|
||
// MarkdownConverter Markdown转换器 - 将各种格式的markdown内容标准化
|
||
type MarkdownConverter struct {
|
||
textProcessor *TextProcessor
|
||
}
|
||
|
||
// NewMarkdownConverter 创建Markdown转换器
|
||
func NewMarkdownConverter(textProcessor *TextProcessor) *MarkdownConverter {
|
||
return &MarkdownConverter{
|
||
textProcessor: textProcessor,
|
||
}
|
||
}
|
||
|
||
// ConvertToStandardMarkdown 将各种格式的内容转换为标准的markdown格式
|
||
// 这是第一步:预处理和标准化
|
||
func (mc *MarkdownConverter) ConvertToStandardMarkdown(content string) string {
|
||
if strings.TrimSpace(content) == "" {
|
||
return content
|
||
}
|
||
|
||
// 1. 先清理HTML标签(保留内容)
|
||
content = mc.textProcessor.StripHTML(content)
|
||
|
||
// 2. 处理代码块 - 确保代码块格式正确
|
||
content = mc.normalizeCodeBlocks(content)
|
||
|
||
// 3. 处理表格 - 确保表格格式正确
|
||
content = mc.normalizeTables(content)
|
||
|
||
// 4. 处理列表 - 统一列表格式
|
||
content = mc.normalizeLists(content)
|
||
|
||
// 5. 处理JSON内容 - 尝试识别并格式化JSON
|
||
content = mc.normalizeJSONContent(content)
|
||
|
||
// 6. 处理链接和图片 - 转换为文本
|
||
content = mc.convertLinksToText(content)
|
||
content = mc.convertImagesToText(content)
|
||
|
||
// 7. 处理引用块
|
||
content = mc.normalizeBlockquotes(content)
|
||
|
||
// 8. 处理水平线
|
||
content = mc.normalizeHorizontalRules(content)
|
||
|
||
// 9. 清理多余空行(保留代码块内的空行)
|
||
content = mc.cleanupExtraBlankLines(content)
|
||
|
||
return content
|
||
}
|
||
|
||
// normalizeCodeBlocks 规范化代码块
|
||
func (mc *MarkdownConverter) normalizeCodeBlocks(content string) string {
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
codeBlockLang := ""
|
||
|
||
for i, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否是代码块开始
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
if inCodeBlock {
|
||
// 代码块结束
|
||
result = append(result, line)
|
||
inCodeBlock = false
|
||
codeBlockLang = ""
|
||
} else {
|
||
// 代码块开始
|
||
inCodeBlock = true
|
||
// 提取语言标识
|
||
if len(trimmed) > 3 {
|
||
codeBlockLang = strings.TrimSpace(trimmed[3:])
|
||
if codeBlockLang != "" {
|
||
result = append(result, fmt.Sprintf("```%s", codeBlockLang))
|
||
} else {
|
||
result = append(result, "```")
|
||
}
|
||
} else {
|
||
result = append(result, "```")
|
||
}
|
||
}
|
||
} else if inCodeBlock {
|
||
// 在代码块中,保留原样
|
||
result = append(result, line)
|
||
} else {
|
||
// 不在代码块中,处理其他内容
|
||
result = append(result, line)
|
||
}
|
||
|
||
// 如果代码块没有正确关闭,在文件末尾自动关闭
|
||
if i == len(lines)-1 && inCodeBlock {
|
||
result = append(result, "```")
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|
||
|
||
// normalizeTables 规范化表格格式
|
||
func (mc *MarkdownConverter) normalizeTables(content string) string {
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否在代码块中
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inCodeBlock = !inCodeBlock
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
if inCodeBlock {
|
||
// 代码块中的内容不处理
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
// 检查是否是表格行
|
||
if strings.Contains(trimmed, "|") {
|
||
// 检查是否是分隔行
|
||
isSeparator := mc.isTableSeparator(trimmed)
|
||
if isSeparator {
|
||
// 确保分隔行格式正确
|
||
cells := strings.Split(trimmed, "|")
|
||
// 清理首尾空元素
|
||
if len(cells) > 0 && cells[0] == "" {
|
||
cells = cells[1:]
|
||
}
|
||
if len(cells) > 0 && cells[len(cells)-1] == "" {
|
||
cells = cells[:len(cells)-1]
|
||
}
|
||
// 构建标准分隔行
|
||
separator := "|"
|
||
for range cells {
|
||
separator += " --- |"
|
||
}
|
||
result = append(result, separator)
|
||
} else {
|
||
// 普通表格行,确保格式正确
|
||
normalizedLine := mc.normalizeTableRow(line)
|
||
result = append(result, normalizedLine)
|
||
}
|
||
} else {
|
||
result = append(result, line)
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|
||
|
||
// isTableSeparator 检查是否是表格分隔行
|
||
func (mc *MarkdownConverter) isTableSeparator(line string) bool {
|
||
trimmed := strings.TrimSpace(line)
|
||
if !strings.Contains(trimmed, "-") {
|
||
return false
|
||
}
|
||
|
||
// 检查是否只包含 |、-、:、空格
|
||
for _, r := range trimmed {
|
||
if r != '|' && r != '-' && r != ':' && r != ' ' {
|
||
return false
|
||
}
|
||
}
|
||
return true
|
||
}
|
||
|
||
// normalizeTableRow 规范化表格行
|
||
func (mc *MarkdownConverter) normalizeTableRow(line string) string {
|
||
trimmed := strings.TrimSpace(line)
|
||
if !strings.Contains(trimmed, "|") {
|
||
return line
|
||
}
|
||
|
||
cells := strings.Split(trimmed, "|")
|
||
// 清理首尾空元素
|
||
if len(cells) > 0 && cells[0] == "" {
|
||
cells = cells[1:]
|
||
}
|
||
if len(cells) > 0 && cells[len(cells)-1] == "" {
|
||
cells = cells[:len(cells)-1]
|
||
}
|
||
|
||
// 清理每个单元格
|
||
normalizedCells := make([]string, 0, len(cells))
|
||
for _, cell := range cells {
|
||
cell = strings.TrimSpace(cell)
|
||
// 移除markdown格式但保留内容
|
||
cell = mc.textProcessor.RemoveMarkdownSyntax(cell)
|
||
normalizedCells = append(normalizedCells, cell)
|
||
}
|
||
|
||
// 重新构建表格行
|
||
return "| " + strings.Join(normalizedCells, " | ") + " |"
|
||
}
|
||
|
||
// normalizeLists 规范化列表格式
|
||
func (mc *MarkdownConverter) normalizeLists(content string) string {
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否在代码块中
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inCodeBlock = !inCodeBlock
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
if inCodeBlock {
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
// 处理有序列表
|
||
if matched, _ := regexp.MatchString(`^\d+\.\s+`, trimmed); matched {
|
||
// 确保格式统一:数字. 空格
|
||
re := regexp.MustCompile(`^(\d+)\.\s*`)
|
||
trimmed = re.ReplaceAllString(trimmed, "$1. ")
|
||
result = append(result, trimmed)
|
||
} else if strings.HasPrefix(trimmed, "- ") || strings.HasPrefix(trimmed, "* ") || strings.HasPrefix(trimmed, "+ ") {
|
||
// 处理无序列表,统一使用 -
|
||
re := regexp.MustCompile(`^[-*+]\s*`)
|
||
trimmed = re.ReplaceAllString(trimmed, "- ")
|
||
result = append(result, trimmed)
|
||
} else {
|
||
result = append(result, line)
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|
||
|
||
// normalizeJSONContent 规范化JSON内容
|
||
func (mc *MarkdownConverter) normalizeJSONContent(content string) string {
|
||
// 尝试识别并格式化JSON代码块
|
||
jsonBlockRegex := regexp.MustCompile("(?s)```(?:json)?\\s*\n(.*?)\n```")
|
||
content = jsonBlockRegex.ReplaceAllStringFunc(content, func(match string) string {
|
||
// 提取JSON内容
|
||
submatch := jsonBlockRegex.FindStringSubmatch(match)
|
||
if len(submatch) < 2 {
|
||
return match
|
||
}
|
||
|
||
jsonStr := strings.TrimSpace(submatch[1])
|
||
// 尝试格式化JSON
|
||
var jsonObj interface{}
|
||
if err := json.Unmarshal([]byte(jsonStr), &jsonObj); err == nil {
|
||
// 格式化成功
|
||
formatted, err := json.MarshalIndent(jsonObj, "", " ")
|
||
if err == nil {
|
||
return fmt.Sprintf("```json\n%s\n```", string(formatted))
|
||
}
|
||
}
|
||
return match
|
||
})
|
||
|
||
return content
|
||
}
|
||
|
||
// convertLinksToText 将链接转换为文本
|
||
func (mc *MarkdownConverter) convertLinksToText(content string) string {
|
||
// [text](url) -> text (url)
|
||
linkRegex := regexp.MustCompile(`\[([^\]]+)\]\(([^\)]+)\)`)
|
||
content = linkRegex.ReplaceAllString(content, "$1 ($2)")
|
||
|
||
// [text][ref] -> text
|
||
refLinkRegex := regexp.MustCompile(`\[([^\]]+)\]\[[^\]]+\]`)
|
||
content = refLinkRegex.ReplaceAllString(content, "$1")
|
||
|
||
return content
|
||
}
|
||
|
||
// convertImagesToText 将图片转换为文本
|
||
func (mc *MarkdownConverter) convertImagesToText(content string) string {
|
||
//  -> [图片: alt]
|
||
imageRegex := regexp.MustCompile(`!\[([^\]]*)\]\([^\)]+\)`)
|
||
content = imageRegex.ReplaceAllString(content, "[图片: $1]")
|
||
|
||
return content
|
||
}
|
||
|
||
// normalizeBlockquotes 规范化引用块
|
||
func (mc *MarkdownConverter) normalizeBlockquotes(content string) string {
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否在代码块中
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inCodeBlock = !inCodeBlock
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
if inCodeBlock {
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
// 处理引用块 > text -> > text
|
||
if strings.HasPrefix(trimmed, ">") {
|
||
// 确保格式统一
|
||
quoteText := strings.TrimSpace(trimmed[1:])
|
||
if quoteText != "" {
|
||
result = append(result, "> "+quoteText)
|
||
} else {
|
||
result = append(result, ">")
|
||
}
|
||
} else {
|
||
result = append(result, line)
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|
||
|
||
// normalizeHorizontalRules 规范化水平线
|
||
func (mc *MarkdownConverter) normalizeHorizontalRules(content string) string {
|
||
// 统一水平线格式为 ---
|
||
hrRegex := regexp.MustCompile(`^[-*_]{3,}\s*$`)
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否在代码块中
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inCodeBlock = !inCodeBlock
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
if inCodeBlock {
|
||
result = append(result, line)
|
||
continue
|
||
}
|
||
|
||
// 如果是水平线,统一格式
|
||
if hrRegex.MatchString(trimmed) {
|
||
result = append(result, "---")
|
||
} else {
|
||
result = append(result, line)
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|
||
|
||
// cleanupExtraBlankLines 清理多余空行(保留代码块内的空行)
|
||
func (mc *MarkdownConverter) cleanupExtraBlankLines(content string) string {
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
lastWasBlank := false
|
||
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否在代码块中
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inCodeBlock = !inCodeBlock
|
||
result = append(result, line)
|
||
lastWasBlank = false
|
||
continue
|
||
}
|
||
|
||
if inCodeBlock {
|
||
// 代码块中的内容全部保留
|
||
result = append(result, line)
|
||
lastWasBlank = (trimmed == "")
|
||
continue
|
||
}
|
||
|
||
// 不在代码块中
|
||
if trimmed == "" {
|
||
// 空行:最多保留一个连续空行
|
||
if !lastWasBlank {
|
||
result = append(result, "")
|
||
lastWasBlank = true
|
||
}
|
||
} else {
|
||
result = append(result, line)
|
||
lastWasBlank = false
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|
||
|
||
// PreprocessContent 预处理内容 - 这是主要的转换入口
|
||
// 先转换,再解析
|
||
func (mc *MarkdownConverter) PreprocessContent(content string) string {
|
||
if strings.TrimSpace(content) == "" {
|
||
return content
|
||
}
|
||
|
||
// 第一步:转换为标准markdown
|
||
content = mc.ConvertToStandardMarkdown(content)
|
||
|
||
// 第二步:尝试识别并转换JSON数组为表格
|
||
content = mc.convertJSONArrayToTable(content)
|
||
|
||
// 第三步:确保所有表格都有正确的分隔行
|
||
content = mc.ensureTableSeparators(content)
|
||
|
||
return content
|
||
}
|
||
|
||
// convertJSONArrayToTable 将JSON数组转换为markdown表格
|
||
func (mc *MarkdownConverter) convertJSONArrayToTable(content string) string {
|
||
// 如果内容已经是表格格式,不处理
|
||
if strings.Contains(content, "|") {
|
||
lines := strings.Split(content, "\n")
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
if strings.Contains(trimmed, "|") && !strings.HasPrefix(trimmed, "```") {
|
||
// 已经有表格,不转换
|
||
return content
|
||
}
|
||
}
|
||
}
|
||
|
||
// 尝试解析为JSON数组
|
||
trimmedContent := strings.TrimSpace(content)
|
||
if strings.HasPrefix(trimmedContent, "[") {
|
||
var jsonArray []map[string]interface{}
|
||
if err := json.Unmarshal([]byte(trimmedContent), &jsonArray); err == nil && len(jsonArray) > 0 {
|
||
// 转换为markdown表格
|
||
return mc.jsonArrayToMarkdownTable(jsonArray)
|
||
}
|
||
}
|
||
|
||
// 尝试解析为JSON对象(包含params或fields字段)
|
||
if strings.HasPrefix(trimmedContent, "{") {
|
||
var jsonObj map[string]interface{}
|
||
if err := json.Unmarshal([]byte(trimmedContent), &jsonObj); err == nil {
|
||
// 检查是否有params字段
|
||
if params, ok := jsonObj["params"].([]interface{}); ok {
|
||
paramMaps := make([]map[string]interface{}, 0, len(params))
|
||
for _, p := range params {
|
||
if pm, ok := p.(map[string]interface{}); ok {
|
||
paramMaps = append(paramMaps, pm)
|
||
}
|
||
}
|
||
if len(paramMaps) > 0 {
|
||
return mc.jsonArrayToMarkdownTable(paramMaps)
|
||
}
|
||
}
|
||
// 检查是否有fields字段
|
||
if fields, ok := jsonObj["fields"].([]interface{}); ok {
|
||
fieldMaps := make([]map[string]interface{}, 0, len(fields))
|
||
for _, f := range fields {
|
||
if fm, ok := f.(map[string]interface{}); ok {
|
||
fieldMaps = append(fieldMaps, fm)
|
||
}
|
||
}
|
||
if len(fieldMaps) > 0 {
|
||
return mc.jsonArrayToMarkdownTable(fieldMaps)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
return content
|
||
}
|
||
|
||
// jsonArrayToMarkdownTable 将JSON数组转换为markdown表格
|
||
func (mc *MarkdownConverter) jsonArrayToMarkdownTable(data []map[string]interface{}) string {
|
||
if len(data) == 0 {
|
||
return ""
|
||
}
|
||
|
||
var result strings.Builder
|
||
|
||
// 收集所有可能的列名(保持原始顺序)
|
||
// 使用map记录是否已添加,使用slice保持顺序
|
||
columnSet := make(map[string]bool)
|
||
columns := make([]string, 0)
|
||
|
||
// 遍历所有数据行,按第一次出现的顺序收集列名
|
||
for _, row := range data {
|
||
for key := range row {
|
||
if !columnSet[key] {
|
||
columns = append(columns, key)
|
||
columnSet[key] = true
|
||
}
|
||
}
|
||
}
|
||
|
||
if len(columns) == 0 {
|
||
return ""
|
||
}
|
||
|
||
// 构建表头(直接使用原始列名,不做映射)
|
||
result.WriteString("|")
|
||
for _, col := range columns {
|
||
result.WriteString(" ")
|
||
result.WriteString(col) // 直接使用原始列名
|
||
result.WriteString(" |")
|
||
}
|
||
result.WriteString("\n")
|
||
|
||
// 构建分隔行
|
||
result.WriteString("|")
|
||
for range columns {
|
||
result.WriteString(" --- |")
|
||
}
|
||
result.WriteString("\n")
|
||
|
||
// 构建数据行
|
||
for _, row := range data {
|
||
result.WriteString("|")
|
||
for _, col := range columns {
|
||
result.WriteString(" ")
|
||
value := mc.formatCellValue(row[col])
|
||
result.WriteString(value)
|
||
result.WriteString(" |")
|
||
}
|
||
result.WriteString("\n")
|
||
}
|
||
|
||
return result.String()
|
||
}
|
||
|
||
// formatColumnName 格式化列名(直接返回原始列名,不做映射)
|
||
// 保持数据库原始数据的列名,不进行转换
|
||
func (mc *MarkdownConverter) formatColumnName(name string) string {
|
||
// 直接返回原始列名,保持数据库数据的原始格式
|
||
return name
|
||
}
|
||
|
||
// formatCellValue 格式化单元格值
|
||
func (mc *MarkdownConverter) formatCellValue(value interface{}) string {
|
||
if value == nil {
|
||
return ""
|
||
}
|
||
|
||
switch v := value.(type) {
|
||
case string:
|
||
v = strings.ReplaceAll(v, "\n", " ")
|
||
v = strings.ReplaceAll(v, "\r", " ")
|
||
v = strings.TrimSpace(v)
|
||
v = strings.ReplaceAll(v, "|", "\\|")
|
||
return v
|
||
case bool:
|
||
if v {
|
||
return "是"
|
||
}
|
||
return "否"
|
||
case float64:
|
||
if v == float64(int64(v)) {
|
||
return fmt.Sprintf("%.0f", v)
|
||
}
|
||
return fmt.Sprintf("%g", v)
|
||
case int, int8, int16, int32, int64:
|
||
return fmt.Sprintf("%d", v)
|
||
case uint, uint8, uint16, uint32, uint64:
|
||
return fmt.Sprintf("%d", v)
|
||
default:
|
||
str := fmt.Sprintf("%v", v)
|
||
str = strings.ReplaceAll(str, "\n", " ")
|
||
str = strings.ReplaceAll(str, "\r", " ")
|
||
str = strings.ReplaceAll(str, "|", "\\|")
|
||
return strings.TrimSpace(str)
|
||
}
|
||
}
|
||
|
||
// ensureTableSeparators 确保所有表格都有正确的分隔行
|
||
func (mc *MarkdownConverter) ensureTableSeparators(content string) string {
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
lastLineWasTableHeader := false
|
||
|
||
for i, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否在代码块中
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inCodeBlock = !inCodeBlock
|
||
result = append(result, line)
|
||
lastLineWasTableHeader = false
|
||
continue
|
||
}
|
||
|
||
if inCodeBlock {
|
||
result = append(result, line)
|
||
lastLineWasTableHeader = false
|
||
continue
|
||
}
|
||
|
||
// 检查是否是表格行
|
||
if strings.Contains(trimmed, "|") {
|
||
// 检查是否是分隔行
|
||
if mc.isTableSeparator(trimmed) {
|
||
result = append(result, line)
|
||
lastLineWasTableHeader = false
|
||
} else {
|
||
// 普通表格行
|
||
result = append(result, line)
|
||
// 检查上一行是否是表头
|
||
if lastLineWasTableHeader {
|
||
// 在表头后插入分隔行
|
||
cells := strings.Split(trimmed, "|")
|
||
if len(cells) > 0 && cells[0] == "" {
|
||
cells = cells[1:]
|
||
}
|
||
if len(cells) > 0 && cells[len(cells)-1] == "" {
|
||
cells = cells[:len(cells)-1]
|
||
}
|
||
separator := "|"
|
||
for range cells {
|
||
separator += " --- |"
|
||
}
|
||
// 在当前位置插入分隔行
|
||
result = append(result[:len(result)-1], separator, line)
|
||
} else {
|
||
// 检查是否是表头(第一行表格)
|
||
if i > 0 {
|
||
prevLine := strings.TrimSpace(lines[i-1])
|
||
if !strings.Contains(prevLine, "|") || mc.isTableSeparator(prevLine) {
|
||
// 这可能是表头
|
||
lastLineWasTableHeader = true
|
||
}
|
||
} else {
|
||
lastLineWasTableHeader = true
|
||
}
|
||
}
|
||
}
|
||
} else {
|
||
result = append(result, line)
|
||
lastLineWasTableHeader = false
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|