356 lines
10 KiB
Go
356 lines
10 KiB
Go
package pdf
|
||
|
||
import (
	"encoding/json"
	"fmt"
	"regexp"
	"sort"
	"strings"
)
|
||
|
||
// MarkdownProcessor Markdown处理器
|
||
type MarkdownProcessor struct {
|
||
textProcessor *TextProcessor
|
||
markdownConverter *MarkdownConverter
|
||
}
|
||
|
||
// NewMarkdownProcessor 创建Markdown处理器
|
||
func NewMarkdownProcessor(textProcessor *TextProcessor) *MarkdownProcessor {
|
||
converter := NewMarkdownConverter(textProcessor)
|
||
return &MarkdownProcessor{
|
||
textProcessor: textProcessor,
|
||
markdownConverter: converter,
|
||
}
|
||
}
|
||
|
||
// MarkdownSection is one heading-delimited section of a Markdown document.
type MarkdownSection struct {
	// Title is the heading line, including its leading '#' characters.
	Title string
	// Level is the heading depth: 2 for "##", 3 for "###", 4 for "####".
	Level int
	// Content is the body text that belongs to this section.
	Content string
}
|
||
|
||
// SplitByMarkdownHeaders 按markdown标题分割内容
|
||
func (mp *MarkdownProcessor) SplitByMarkdownHeaders(content string) []MarkdownSection {
|
||
lines := strings.Split(content, "\n")
|
||
var sections []MarkdownSection
|
||
var currentSection MarkdownSection
|
||
var currentContent []string
|
||
|
||
// 标题正则:匹配 #, ##, ###, #### 等
|
||
headerRegex := regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
|
||
|
||
for _, line := range lines {
|
||
trimmedLine := strings.TrimSpace(line)
|
||
|
||
// 检查是否是标题行
|
||
if matches := headerRegex.FindStringSubmatch(trimmedLine); matches != nil {
|
||
// 如果之前有内容,先保存之前的章节
|
||
if currentSection.Title != "" || len(currentContent) > 0 {
|
||
if currentSection.Title != "" {
|
||
currentSection.Content = strings.Join(currentContent, "\n")
|
||
sections = append(sections, currentSection)
|
||
}
|
||
}
|
||
|
||
// 开始新章节
|
||
level := len(matches[1]) // #号的数量
|
||
currentSection = MarkdownSection{
|
||
Title: trimmedLine,
|
||
Level: level,
|
||
Content: "",
|
||
}
|
||
currentContent = []string{}
|
||
} else {
|
||
// 普通内容行,添加到当前章节
|
||
currentContent = append(currentContent, line)
|
||
}
|
||
}
|
||
|
||
// 保存最后一个章节
|
||
if currentSection.Title != "" || len(currentContent) > 0 {
|
||
if currentSection.Title != "" {
|
||
currentSection.Content = strings.Join(currentContent, "\n")
|
||
sections = append(sections, currentSection)
|
||
} else if len(currentContent) > 0 {
|
||
// 如果没有标题,但开头有内容,作为第一个章节
|
||
sections = append(sections, MarkdownSection{
|
||
Title: "",
|
||
Level: 0,
|
||
Content: strings.Join(currentContent, "\n"),
|
||
})
|
||
}
|
||
}
|
||
|
||
return sections
|
||
}
|
||
|
||
// FormatContentAsMarkdownTable 将数据库中的数据格式化为标准的markdown表格格式
|
||
// 先进行预处理转换,再进行解析
|
||
func (mp *MarkdownProcessor) FormatContentAsMarkdownTable(content string) string {
|
||
if strings.TrimSpace(content) == "" {
|
||
return content
|
||
}
|
||
|
||
// 第一步:预处理和转换(标准化markdown格式)
|
||
content = mp.markdownConverter.PreprocessContent(content)
|
||
|
||
// 如果内容已经是markdown表格格式(包含|符号),检查格式是否正确
|
||
if strings.Contains(content, "|") {
|
||
// 检查是否已经是有效的markdown表格
|
||
lines := strings.Split(content, "\n")
|
||
hasTableFormat := false
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
// 跳过代码块中的内容
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
continue
|
||
}
|
||
if strings.Contains(trimmed, "|") && !strings.HasPrefix(trimmed, "#") {
|
||
hasTableFormat = true
|
||
break
|
||
}
|
||
}
|
||
if hasTableFormat {
|
||
return content
|
||
}
|
||
}
|
||
|
||
// 提取代码块(保留代码块不变)
|
||
codeBlocks := mp.ExtractCodeBlocks(content)
|
||
|
||
// 移除代码块,只处理非代码块部分
|
||
contentWithoutCodeBlocks := mp.RemoveCodeBlocks(content)
|
||
|
||
// 如果移除代码块后内容为空,说明只有代码块,直接返回原始内容
|
||
if strings.TrimSpace(contentWithoutCodeBlocks) == "" {
|
||
return content
|
||
}
|
||
|
||
// 尝试解析非代码块部分为JSON数组(仅当内容看起来像JSON时)
|
||
trimmedContent := strings.TrimSpace(contentWithoutCodeBlocks)
|
||
|
||
// 检查是否看起来像JSON(以[或{开头)
|
||
if strings.HasPrefix(trimmedContent, "[") || strings.HasPrefix(trimmedContent, "{") {
|
||
// 尝试解析为JSON数组
|
||
var requestParams []map[string]interface{}
|
||
if err := json.Unmarshal([]byte(trimmedContent), &requestParams); err == nil && len(requestParams) > 0 {
|
||
// 成功解析为JSON数组,转换为markdown表格
|
||
tableContent := mp.jsonArrayToMarkdownTable(requestParams)
|
||
// 如果有代码块,在表格后添加代码块
|
||
if len(codeBlocks) > 0 {
|
||
return tableContent + "\n\n" + strings.Join(codeBlocks, "\n\n")
|
||
}
|
||
return tableContent
|
||
}
|
||
|
||
// 尝试解析为单个JSON对象
|
||
var singleObj map[string]interface{}
|
||
if err := json.Unmarshal([]byte(trimmedContent), &singleObj); err == nil {
|
||
// 检查是否是包含数组字段的对象
|
||
if params, ok := singleObj["params"].([]interface{}); ok {
|
||
// 转换为map数组
|
||
paramMaps := make([]map[string]interface{}, 0, len(params))
|
||
for _, p := range params {
|
||
if pm, ok := p.(map[string]interface{}); ok {
|
||
paramMaps = append(paramMaps, pm)
|
||
}
|
||
}
|
||
if len(paramMaps) > 0 {
|
||
tableContent := mp.jsonArrayToMarkdownTable(paramMaps)
|
||
// 如果有代码块,在表格后添加代码块
|
||
if len(codeBlocks) > 0 {
|
||
return tableContent + "\n\n" + strings.Join(codeBlocks, "\n\n")
|
||
}
|
||
return tableContent
|
||
}
|
||
}
|
||
if fields, ok := singleObj["fields"].([]interface{}); ok {
|
||
// 转换为map数组
|
||
fieldMaps := make([]map[string]interface{}, 0, len(fields))
|
||
for _, f := range fields {
|
||
if fm, ok := f.(map[string]interface{}); ok {
|
||
fieldMaps = append(fieldMaps, fm)
|
||
}
|
||
}
|
||
if len(fieldMaps) > 0 {
|
||
tableContent := mp.jsonArrayToMarkdownTable(fieldMaps)
|
||
// 如果有代码块,在表格后添加代码块
|
||
if len(codeBlocks) > 0 {
|
||
return tableContent + "\n\n" + strings.Join(codeBlocks, "\n\n")
|
||
}
|
||
return tableContent
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// 如果无法解析为JSON,返回原始内容(保留代码块)
|
||
return content
|
||
}
|
||
|
||
// ExtractCodeBlocks 提取内容中的所有代码块
|
||
func (mp *MarkdownProcessor) ExtractCodeBlocks(content string) []string {
|
||
var codeBlocks []string
|
||
lines := strings.Split(content, "\n")
|
||
inCodeBlock := false
|
||
var currentBlock []string
|
||
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否是代码块开始
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
if inCodeBlock {
|
||
// 代码块结束
|
||
currentBlock = append(currentBlock, line)
|
||
codeBlocks = append(codeBlocks, strings.Join(currentBlock, "\n"))
|
||
currentBlock = []string{}
|
||
inCodeBlock = false
|
||
} else {
|
||
// 代码块开始
|
||
inCodeBlock = true
|
||
currentBlock = []string{line}
|
||
}
|
||
} else if inCodeBlock {
|
||
// 在代码块中
|
||
currentBlock = append(currentBlock, line)
|
||
}
|
||
}
|
||
|
||
// 如果代码块没有正确关闭,也添加进去
|
||
if inCodeBlock && len(currentBlock) > 0 {
|
||
codeBlocks = append(codeBlocks, strings.Join(currentBlock, "\n"))
|
||
}
|
||
|
||
return codeBlocks
|
||
}
|
||
|
||
// RemoveCodeBlocks 移除内容中的所有代码块
|
||
func (mp *MarkdownProcessor) RemoveCodeBlocks(content string) string {
|
||
lines := strings.Split(content, "\n")
|
||
var result []string
|
||
inCodeBlock := false
|
||
|
||
for _, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
|
||
// 检查是否是代码块开始或结束
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inCodeBlock = !inCodeBlock
|
||
continue // 跳过代码块的标记行
|
||
}
|
||
|
||
// 如果不在代码块中,保留这一行
|
||
if !inCodeBlock {
|
||
result = append(result, line)
|
||
}
|
||
}
|
||
|
||
return strings.Join(result, "\n")
|
||
}
|
||
|
||
// jsonArrayToMarkdownTable 将JSON数组转换为标准的markdown表格
|
||
func (mp *MarkdownProcessor) jsonArrayToMarkdownTable(data []map[string]interface{}) string {
|
||
if len(data) == 0 {
|
||
return ""
|
||
}
|
||
|
||
var result strings.Builder
|
||
|
||
// 收集所有可能的列名(保持原始顺序)
|
||
// 使用map记录是否已添加,使用slice保持顺序
|
||
columnSet := make(map[string]bool)
|
||
columns := make([]string, 0)
|
||
|
||
// 遍历所有数据行,按第一次出现的顺序收集列名
|
||
for _, row := range data {
|
||
for key := range row {
|
||
if !columnSet[key] {
|
||
columns = append(columns, key)
|
||
columnSet[key] = true
|
||
}
|
||
}
|
||
}
|
||
|
||
if len(columns) == 0 {
|
||
return ""
|
||
}
|
||
|
||
// 构建表头(直接使用原始列名,不做映射)
|
||
result.WriteString("|")
|
||
for _, col := range columns {
|
||
result.WriteString(" ")
|
||
result.WriteString(col) // 直接使用原始列名
|
||
result.WriteString(" |")
|
||
}
|
||
result.WriteString("\n")
|
||
|
||
// 构建分隔行
|
||
result.WriteString("|")
|
||
for range columns {
|
||
result.WriteString(" --- |")
|
||
}
|
||
result.WriteString("\n")
|
||
|
||
// 构建数据行
|
||
for _, row := range data {
|
||
result.WriteString("|")
|
||
for _, col := range columns {
|
||
result.WriteString(" ")
|
||
value := mp.formatCellValue(row[col])
|
||
result.WriteString(value)
|
||
result.WriteString(" |")
|
||
}
|
||
result.WriteString("\n")
|
||
}
|
||
|
||
return result.String()
|
||
}
|
||
|
||
// formatColumnName 格式化列名(直接返回原始列名,不做映射)
|
||
// 保持数据库原始数据的列名,不进行转换
|
||
func (mp *MarkdownProcessor) formatColumnName(name string) string {
|
||
// 直接返回原始列名,保持数据库数据的原始格式
|
||
return name
|
||
}
|
||
|
||
// formatCellValue 格式化单元格值
|
||
func (mp *MarkdownProcessor) formatCellValue(value interface{}) string {
|
||
if value == nil {
|
||
return ""
|
||
}
|
||
|
||
switch v := value.(type) {
|
||
case string:
|
||
// 清理字符串,移除换行符和多余空格
|
||
v = strings.ReplaceAll(v, "\n", " ")
|
||
v = strings.ReplaceAll(v, "\r", " ")
|
||
v = strings.TrimSpace(v)
|
||
// 转义markdown特殊字符
|
||
v = strings.ReplaceAll(v, "|", "\\|")
|
||
return v
|
||
case bool:
|
||
if v {
|
||
return "是"
|
||
}
|
||
return "否"
|
||
case float64:
|
||
// 如果是整数,不显示小数点
|
||
if v == float64(int64(v)) {
|
||
return fmt.Sprintf("%.0f", v)
|
||
}
|
||
return fmt.Sprintf("%g", v)
|
||
case int, int8, int16, int32, int64:
|
||
return fmt.Sprintf("%d", v)
|
||
case uint, uint8, uint16, uint32, uint64:
|
||
return fmt.Sprintf("%d", v)
|
||
default:
|
||
// 对于其他类型,转换为字符串
|
||
str := fmt.Sprintf("%v", v)
|
||
str = strings.ReplaceAll(str, "\n", " ")
|
||
str = strings.ReplaceAll(str, "\r", " ")
|
||
str = strings.ReplaceAll(str, "|", "\\|")
|
||
return strings.TrimSpace(str)
|
||
}
|
||
}
|