Files
tyapi-server/internal/shared/pdf/markdown_processor.go
2025-12-03 12:03:42 +08:00

356 lines
10 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package pdf
import (
	"encoding/json"
	"fmt"
	"regexp"
	"sort"
	"strings"
)
// MarkdownProcessor groups the markdown helpers defined in this file. It holds
// a TextProcessor and the MarkdownConverter that NewMarkdownProcessor builds
// from that same TextProcessor.
type MarkdownProcessor struct {
textProcessor *TextProcessor // text helper supplied by the caller of NewMarkdownProcessor
markdownConverter *MarkdownConverter // used by FormatContentAsMarkdownTable to preprocess content
}
// NewMarkdownProcessor returns a MarkdownProcessor wired to textProcessor.
// The same textProcessor is passed to NewMarkdownConverter to create the
// converter that the processor keeps alongside it.
func NewMarkdownProcessor(textProcessor *TextProcessor) *MarkdownProcessor {
	processor := new(MarkdownProcessor)
	processor.textProcessor = textProcessor
	processor.markdownConverter = NewMarkdownConverter(textProcessor)
	return processor
}
// MarkdownSection is one heading-delimited chunk of a markdown document, as
// produced by SplitByMarkdownHeaders.
type MarkdownSection struct {
Title string // heading line including its leading '#' characters; empty for text that has no heading
Level int // number of '#' characters in the heading (## is 2, ### is 3, #### is 4); 0 when there is no heading
Content string // the lines that follow the heading, joined with "\n"
}
// markdownSectionHeaderRe matches a heading line: one to six '#' characters,
// at least one whitespace character, then the heading text. It is compiled
// once instead of on every SplitByMarkdownHeaders call.
var markdownSectionHeaderRe = regexp.MustCompile(`^(#{1,6})\s+(.+)$`)

// SplitByMarkdownHeaders splits content into sections, one per markdown
// heading.
//
// Every line whose trimmed text matches markdownSectionHeaderRe starts a new
// section: Title is the trimmed heading line, Level is the number of '#'
// characters, and Content is every following line (untrimmed) up to the next
// heading, joined with "\n".
//
// Lines inside a fenced code block (delimited by lines starting with ```) are
// never treated as headings, so a shell or Python comment such as "# install"
// inside a code sample stays part of the surrounding section. The fence lines
// themselves are kept in Content. An unclosed fence runs to the end of input.
//
// If content has no heading at all, the result is a single section with an
// empty Title, Level 0 and the whole input as Content.
//
// NOTE(review): when at least one heading exists, text that precedes the
// first heading is not returned in any section. That matches the previous
// behaviour; confirm with callers whether it is intended.
func (mp *MarkdownProcessor) SplitByMarkdownHeaders(content string) []MarkdownSection {
	var (
		sections    []MarkdownSection
		current     MarkdownSection
		body        []string
		inCodeBlock bool
	)

	// flush stores the section collected so far. Untitled text seen before
	// the first heading is deliberately not stored here (see NOTE above).
	flush := func() {
		if current.Title == "" {
			return
		}
		current.Content = strings.Join(body, "\n")
		sections = append(sections, current)
	}

	for _, line := range strings.Split(content, "\n") {
		trimmed := strings.TrimSpace(line)

		// A fence line opens or closes a code block and is ordinary content.
		if strings.HasPrefix(trimmed, "```") {
			inCodeBlock = !inCodeBlock
			body = append(body, line)
			continue
		}

		if !inCodeBlock {
			if m := markdownSectionHeaderRe.FindStringSubmatch(trimmed); m != nil {
				flush()
				current = MarkdownSection{Title: trimmed, Level: len(m[1])}
				body = nil
				continue
			}
		}

		body = append(body, line)
	}

	if current.Title != "" {
		flush()
		return sections
	}

	// No heading anywhere: return the input as one untitled section.
	if len(body) > 0 {
		sections = append(sections, MarkdownSection{
			Title:   "",
			Level:   0,
			Content: strings.Join(body, "\n"),
		})
	}
	return sections
}
// FormatContentAsMarkdownTable turns JSON-shaped content into a markdown
// table and leaves every other kind of content alone.
//
// Steps:
//  1. Blank content is returned unchanged.
//  2. The content is normalised with markdownConverter.PreprocessContent; all
//     later steps (and every "unchanged" return) use the preprocessed text.
//  3. If a line outside fenced code blocks contains "|" and is not a heading,
//     the content is assumed to already be a markdown table and is returned.
//     Lines inside code blocks are ignored for this check, so a pipe in a
//     code sample no longer prevents the JSON conversion below.
//  4. Code blocks are set aside. If the remaining text is a JSON array of
//     objects, or a JSON object whose "params" or "fields" key holds an array
//     of objects, those objects are rendered with jsonArrayToMarkdownTable
//     and the code blocks are appended after the table, separated by blank
//     lines.
//  5. Otherwise the preprocessed content is returned.
//
// If the decoded rows produce an empty table (for example "[{}]"), the
// preprocessed content is returned instead of an empty string.
func (mp *MarkdownProcessor) FormatContentAsMarkdownTable(content string) string {
	if strings.TrimSpace(content) == "" {
		return content
	}

	content = mp.markdownConverter.PreprocessContent(content)

	if hasTableRowOutsideCodeBlocks(content) {
		return content
	}

	codeBlocks := mp.ExtractCodeBlocks(content)
	body := strings.TrimSpace(mp.RemoveCodeBlocks(content))
	if body == "" {
		// Nothing but code blocks: keep them exactly as they are.
		return content
	}

	// Only attempt JSON decoding when the text looks like JSON.
	if !strings.HasPrefix(body, "[") && !strings.HasPrefix(body, "{") {
		return content
	}

	rows := tableRowsFromJSON(body)
	if len(rows) == 0 {
		return content
	}

	table := mp.jsonArrayToMarkdownTable(rows)
	if table == "" {
		// Rows without any keys give no table; do not drop the content.
		return content
	}
	if len(codeBlocks) > 0 {
		return table + "\n\n" + strings.Join(codeBlocks, "\n\n")
	}
	return table
}

// hasTableRowOutsideCodeBlocks reports whether any line outside fenced code
// blocks contains "|" and does not start with "#".
func hasTableRowOutsideCodeBlocks(content string) bool {
	if !strings.Contains(content, "|") {
		return false
	}
	inCodeBlock := false
	for _, line := range strings.Split(content, "\n") {
		trimmed := strings.TrimSpace(line)
		if strings.HasPrefix(trimmed, "```") {
			inCodeBlock = !inCodeBlock
			continue
		}
		if inCodeBlock {
			continue
		}
		if strings.Contains(trimmed, "|") && !strings.HasPrefix(trimmed, "#") {
			return true
		}
	}
	return false
}

// tableRowsFromJSON decodes text as a JSON array of objects, or as a JSON
// object carrying such an array under "params" or "fields" (checked in that
// order). It returns nil when no non-empty list of objects is found.
func tableRowsFromJSON(text string) []map[string]interface{} {
	var list []map[string]interface{}
	if err := json.Unmarshal([]byte(text), &list); err == nil && len(list) > 0 {
		return list
	}

	var obj map[string]interface{}
	if err := json.Unmarshal([]byte(text), &obj); err != nil {
		return nil
	}
	for _, key := range []string{"params", "fields"} {
		items, ok := obj[key].([]interface{})
		if !ok {
			continue
		}
		rows := make([]map[string]interface{}, 0, len(items))
		for _, item := range items {
			// Array elements that are not JSON objects are skipped.
			if row, ok := item.(map[string]interface{}); ok {
				rows = append(rows, row)
			}
		}
		if len(rows) > 0 {
			return rows
		}
	}
	return nil
}
// ExtractCodeBlocks returns every fenced code block found in content.
//
// A line whose trimmed text starts with ``` toggles between "outside" and
// "inside" a block. Each returned element is the block's original lines, from
// the opening fence through the closing fence, joined with "\n". A block that
// is still open at the end of the input is returned as well, without a
// closing fence. The result is nil when content has no fence line.
func (mp *MarkdownProcessor) ExtractCodeBlocks(content string) []string {
	var blocks []string
	lines := strings.Split(content, "\n")

	// open is the index of the current opening fence, or -1 when outside a block.
	open := -1
	for i, line := range lines {
		if !strings.HasPrefix(strings.TrimSpace(line), "```") {
			continue
		}
		if open < 0 {
			open = i
			continue
		}
		// Closing fence: the block is the contiguous run of lines open..i.
		blocks = append(blocks, strings.Join(lines[open:i+1], "\n"))
		open = -1
	}

	// An unterminated block runs to the end of the input.
	if open >= 0 {
		blocks = append(blocks, strings.Join(lines[open:], "\n"))
	}
	return blocks
}
// RemoveCodeBlocks returns content with every fenced code block removed.
//
// A line whose trimmed text starts with ``` toggles the "inside a block"
// state and is itself dropped. Lines inside a block are dropped; all other
// lines are kept unchanged and joined with "\n". An unclosed fence removes
// everything after it.
func (mp *MarkdownProcessor) RemoveCodeBlocks(content string) string {
	lines := strings.Split(content, "\n")

	// Filter in place: kept never gets ahead of the line being read.
	kept := lines[:0]
	fenced := false
	for _, line := range lines {
		switch {
		case strings.HasPrefix(strings.TrimSpace(line), "```"):
			fenced = !fenced
		case !fenced:
			kept = append(kept, line)
		}
	}
	return strings.Join(kept, "\n")
}
// jsonArrayToMarkdownTable renders data as a markdown table with one row per
// element and one column per distinct key found in any element.
//
// Column names are used verbatim as header cells. Columns are emitted in
// sorted key order: the rows are Go maps, whose iteration order is randomised,
// so the key order of the original JSON is not available here, and sorting is
// what makes the output identical from one call to the next. A key missing
// from a row yields an empty cell. Cell text comes from formatCellValue.
//
// It returns "" when data is empty or none of the rows has any key.
func (mp *MarkdownProcessor) jsonArrayToMarkdownTable(data []map[string]interface{}) string {
	if len(data) == 0 {
		return ""
	}

	// Collect the union of keys across all rows.
	seen := make(map[string]struct{})
	var columns []string
	for _, row := range data {
		for key := range row {
			if _, dup := seen[key]; dup {
				continue
			}
			seen[key] = struct{}{}
			columns = append(columns, key)
		}
	}
	if len(columns) == 0 {
		return ""
	}
	// Map iteration order is random; sort for a stable column layout.
	sort.Strings(columns)

	var table strings.Builder

	// Header row.
	table.WriteString("|")
	for _, col := range columns {
		table.WriteString(" ")
		table.WriteString(col)
		table.WriteString(" |")
	}
	table.WriteString("\n")

	// Separator row.
	table.WriteString("|")
	for range columns {
		table.WriteString(" --- |")
	}
	table.WriteString("\n")

	// Data rows.
	for _, row := range data {
		table.WriteString("|")
		for _, col := range columns {
			table.WriteString(" ")
			table.WriteString(mp.formatCellValue(row[col]))
			table.WriteString(" |")
		}
		table.WriteString("\n")
	}
	return table.String()
}
// formatColumnName returns name unchanged: column names are shown exactly as
// they appear in the source data, with no mapping or translation applied.
// NOTE(review): no caller of this method is visible in this file.
func (mp *MarkdownProcessor) formatColumnName(name string) string {
// Identity on purpose: keep the original column name as-is.
return name
}
// formatCellValue renders one decoded value as the text of a markdown table
// cell.
//
// Rendering rules:
//   - nil becomes an empty cell.
//   - Strings have "\n" and "\r" replaced by spaces, surrounding whitespace
//     trimmed, and "|" escaped as "\|" so the cell cannot break the row.
//   - Booleans become "是" (true) or "否" (false).
//   - float64 values are written the way encoding/json writes numbers:
//     whole numbers without a decimal point ("3") and fractional numbers in
//     plain decimal notation ("1234567.5"), switching to exponent form only
//     for very small or very large magnitudes. Values JSON cannot represent
//     (NaN, ±Inf) fall back to %g.
//   - Go integer types are printed in base 10.
//   - Any other value is printed with %v and then cleaned like a string.
func (mp *MarkdownProcessor) formatCellValue(value interface{}) string {
	var text string
	switch v := value.(type) {
	case nil:
		return ""
	case bool:
		if v {
			return "是"
		}
		return "否"
	case float64:
		// %g would print 1234567.5 as "1.2345675e+06", and an int64 round-trip
		// check is unreliable for values outside the int64 range. Re-encoding
		// as JSON avoids both.
		if encoded, err := json.Marshal(v); err == nil {
			return string(encoded)
		}
		return fmt.Sprintf("%g", v)
	case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64:
		return fmt.Sprintf("%d", v)
	case string:
		text = v
	default:
		text = fmt.Sprintf("%v", v)
	}

	// Shared clean-up for free-form text: one line, trimmed, pipes escaped.
	text = strings.ReplaceAll(text, "\n", " ")
	text = strings.ReplaceAll(text, "\r", " ")
	text = strings.TrimSpace(text)
	return strings.ReplaceAll(text, "|", "\\|")
}