616 lines
17 KiB
Go
616 lines
17 KiB
Go
package pdf
|
||
|
||
import (
	"context"
	"encoding/json"
	"fmt"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"tyapi-server/internal/domains/product/entities"

	"go.uber.org/zap"
)
|
||
|
||
// DatabaseTableReader 数据库表格数据读取器
|
||
type DatabaseTableReader struct {
|
||
logger *zap.Logger
|
||
}
|
||
|
||
// NewDatabaseTableReader 创建数据库表格数据读取器
|
||
func NewDatabaseTableReader(logger *zap.Logger) *DatabaseTableReader {
|
||
return &DatabaseTableReader{
|
||
logger: logger,
|
||
}
|
||
}
|
||
|
||
// TableData is a parsed table: a header row plus the data rows below it.
type TableData struct {
	Headers []string   // column titles, in display order
	Rows    [][]string // data rows as cell strings
}
|
||
|
||
// TableWithTitle 带标题的表格
|
||
type TableWithTitle struct {
|
||
Title string // 表格标题(markdown标题)
|
||
Table *TableData // 表格数据
|
||
}
|
||
|
||
// ReadTableFromDocumentation 从产品文档中读取表格数据
|
||
// 先将markdown表格转换为JSON格式,然后再转换为表格数据
|
||
func (r *DatabaseTableReader) ReadTableFromDocumentation(ctx context.Context, doc *entities.ProductDocumentation, fieldType string) (*TableData, error) {
|
||
var content string
|
||
|
||
switch fieldType {
|
||
case "request_params":
|
||
content = doc.RequestParams
|
||
case "response_fields":
|
||
content = doc.ResponseFields
|
||
case "response_example":
|
||
content = doc.ResponseExample
|
||
case "error_codes":
|
||
content = doc.ErrorCodes
|
||
default:
|
||
return nil, fmt.Errorf("未知的字段类型: %s", fieldType)
|
||
}
|
||
|
||
// 检查内容是否为空(去除空白字符后)
|
||
trimmedContent := strings.TrimSpace(content)
|
||
if trimmedContent == "" {
|
||
return nil, fmt.Errorf("字段 %s 内容为空", fieldType)
|
||
}
|
||
|
||
// 先尝试解析为JSON数组(如果已经是JSON格式)
|
||
var jsonArray []map[string]interface{}
|
||
if err := json.Unmarshal([]byte(content), &jsonArray); err == nil && len(jsonArray) > 0 {
|
||
r.logger.Info("数据已经是JSON格式,直接使用",
|
||
zap.String("field_type", fieldType),
|
||
zap.Int("json_array_length", len(jsonArray)))
|
||
return r.convertJSONArrayToTable(jsonArray), nil
|
||
}
|
||
|
||
// 尝试解析为单个JSON对象(包含数组字段)
|
||
var jsonObj map[string]interface{}
|
||
if err := json.Unmarshal([]byte(content), &jsonObj); err == nil {
|
||
// 查找包含数组的字段
|
||
for _, value := range jsonObj {
|
||
if arr, ok := value.([]interface{}); ok && len(arr) > 0 {
|
||
// 转换为map数组
|
||
mapArray := make([]map[string]interface{}, 0, len(arr))
|
||
for _, item := range arr {
|
||
if itemMap, ok := item.(map[string]interface{}); ok {
|
||
mapArray = append(mapArray, itemMap)
|
||
}
|
||
}
|
||
if len(mapArray) > 0 {
|
||
r.logger.Info("从JSON对象中提取数组数据", zap.String("field_type", fieldType))
|
||
return r.convertJSONArrayToTable(mapArray), nil
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// 如果不是JSON格式,先解析为markdown表格,然后转换为JSON格式
|
||
r.logger.Info("开始解析markdown表格并转换为JSON",
|
||
zap.String("field_type", fieldType),
|
||
zap.Int("content_length", len(content)),
|
||
zap.String("content_preview", r.getContentPreview(content, 200)))
|
||
|
||
tableData, err := r.parseMarkdownTable(content)
|
||
if err != nil {
|
||
// 错误已返回,不记录日志
|
||
return nil, fmt.Errorf("解析markdown表格失败: %w", err)
|
||
}
|
||
|
||
r.logger.Info("markdown表格解析成功",
|
||
zap.String("field_type", fieldType),
|
||
zap.Int("header_count", len(tableData.Headers)),
|
||
zap.Int("row_count", len(tableData.Rows)),
|
||
zap.Strings("headers", tableData.Headers))
|
||
|
||
// 将markdown表格数据转换为JSON格式(保持列顺序)
|
||
r.logger.Debug("开始将表格数据转换为JSON格式", zap.String("field_type", fieldType))
|
||
jsonArray = r.convertTableDataToJSON(tableData)
|
||
|
||
r.logger.Info("表格数据已转换为JSON格式",
|
||
zap.String("field_type", fieldType),
|
||
zap.Int("json_array_length", len(jsonArray)))
|
||
|
||
// 记录转换后的JSON(用于调试)
|
||
jsonBytes, marshalErr := json.MarshalIndent(jsonArray, "", " ")
|
||
if marshalErr != nil {
|
||
r.logger.Warn("JSON序列化失败",
|
||
zap.String("field_type", fieldType),
|
||
zap.Error(marshalErr))
|
||
} else {
|
||
previewLen := len(jsonBytes)
|
||
if previewLen > 1000 {
|
||
previewLen = 1000
|
||
}
|
||
r.logger.Debug("转换后的JSON数据预览",
|
||
zap.String("field_type", fieldType),
|
||
zap.Int("json_length", len(jsonBytes)),
|
||
zap.String("json_preview", string(jsonBytes[:previewLen])))
|
||
|
||
// 如果JSON数据较大,记录完整路径提示
|
||
if len(jsonBytes) > 1000 {
|
||
r.logger.Info("JSON数据较大,完整内容请查看debug级别日志",
|
||
zap.String("field_type", fieldType),
|
||
zap.Int("json_length", len(jsonBytes)))
|
||
}
|
||
}
|
||
|
||
// 将JSON数据转换回表格数据用于渲染(使用原始表头顺序保持列顺序)
|
||
return r.convertJSONArrayToTableWithOrder(jsonArray, tableData.Headers), nil
|
||
}
|
||
|
||
// convertJSONArrayToTable 将JSON数组转换为表格数据(用于已经是JSON格式的数据)
|
||
func (r *DatabaseTableReader) convertJSONArrayToTable(data []map[string]interface{}) *TableData {
|
||
if len(data) == 0 {
|
||
return &TableData{
|
||
Headers: []string{},
|
||
Rows: [][]string{},
|
||
}
|
||
}
|
||
|
||
// 收集所有列名(按第一次出现的顺序)
|
||
columnSet := make(map[string]bool)
|
||
columns := make([]string, 0)
|
||
|
||
// 从第一行开始收集列名,保持第一次出现的顺序
|
||
for _, row := range data {
|
||
for key := range row {
|
||
if !columnSet[key] {
|
||
columns = append(columns, key)
|
||
columnSet[key] = true
|
||
}
|
||
}
|
||
// 只从第一行收集,保持顺序
|
||
if len(columns) > 0 {
|
||
break
|
||
}
|
||
}
|
||
|
||
// 如果第一行没有收集到所有列,继续收集(但顺序可能不稳定)
|
||
if len(columns) == 0 {
|
||
for _, row := range data {
|
||
for key := range row {
|
||
if !columnSet[key] {
|
||
columns = append(columns, key)
|
||
columnSet[key] = true
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// 构建表头
|
||
headers := make([]string, len(columns))
|
||
copy(headers, columns)
|
||
|
||
// 构建数据行
|
||
rows := make([][]string, 0, len(data))
|
||
for _, row := range data {
|
||
rowData := make([]string, len(columns))
|
||
for i, col := range columns {
|
||
value := row[col]
|
||
rowData[i] = r.formatValue(value)
|
||
}
|
||
rows = append(rows, rowData)
|
||
}
|
||
|
||
return &TableData{
|
||
Headers: headers,
|
||
Rows: rows,
|
||
}
|
||
}
|
||
|
||
// convertJSONArrayToTableWithOrder 将JSON数组转换为表格数据(使用指定的列顺序)
|
||
func (r *DatabaseTableReader) convertJSONArrayToTableWithOrder(data []map[string]interface{}, originalHeaders []string) *TableData {
|
||
if len(data) == 0 {
|
||
return &TableData{
|
||
Headers: originalHeaders,
|
||
Rows: [][]string{},
|
||
}
|
||
}
|
||
|
||
// 使用原始表头顺序
|
||
headers := make([]string, len(originalHeaders))
|
||
copy(headers, originalHeaders)
|
||
|
||
// 构建数据行,按照原始表头顺序
|
||
rows := make([][]string, 0, len(data))
|
||
for _, row := range data {
|
||
rowData := make([]string, len(headers))
|
||
for i, header := range headers {
|
||
value := row[header]
|
||
rowData[i] = r.formatValue(value)
|
||
}
|
||
rows = append(rows, rowData)
|
||
}
|
||
|
||
r.logger.Debug("JSON转表格完成(保持列顺序)",
|
||
zap.Int("header_count", len(headers)),
|
||
zap.Int("row_count", len(rows)),
|
||
zap.Strings("headers", headers))
|
||
|
||
return &TableData{
|
||
Headers: headers,
|
||
Rows: rows,
|
||
}
|
||
}
|
||
|
||
// parseMarkdownTablesWithTitles 解析markdown格式的表格(支持多个表格,保留标题)
|
||
func (r *DatabaseTableReader) parseMarkdownTablesWithTitles(content string) ([]TableWithTitle, error) {
|
||
lines := strings.Split(content, "\n")
|
||
var result []TableWithTitle
|
||
var currentTitle string
|
||
var currentHeaders []string
|
||
var currentRows [][]string
|
||
inTable := false
|
||
hasValidHeader := false
|
||
nonTableLineCount := 0
|
||
maxNonTableLines := 3 // 允许最多3个连续非表格行
|
||
|
||
for _, line := range lines {
|
||
line = strings.TrimSpace(line)
|
||
|
||
// 处理markdown标题行(以#开头)- 保存标题
|
||
if strings.HasPrefix(line, "#") {
|
||
// 如果当前有表格,先保存
|
||
if inTable && len(currentHeaders) > 0 {
|
||
result = append(result, TableWithTitle{
|
||
Title: currentTitle,
|
||
Table: &TableData{
|
||
Headers: currentHeaders,
|
||
Rows: currentRows,
|
||
},
|
||
})
|
||
currentHeaders = nil
|
||
currentRows = nil
|
||
inTable = false
|
||
hasValidHeader = false
|
||
}
|
||
// 提取标题(移除#和空格)
|
||
currentTitle = strings.TrimSpace(strings.TrimPrefix(line, "#"))
|
||
currentTitle = strings.TrimSpace(strings.TrimPrefix(currentTitle, "#"))
|
||
currentTitle = strings.TrimSpace(strings.TrimPrefix(currentTitle, "#"))
|
||
nonTableLineCount = 0
|
||
continue
|
||
}
|
||
|
||
// 跳过空行
|
||
if line == "" {
|
||
if inTable {
|
||
nonTableLineCount++
|
||
if nonTableLineCount > maxNonTableLines {
|
||
// 当前表格结束,保存并重置
|
||
if len(currentHeaders) > 0 {
|
||
result = append(result, TableWithTitle{
|
||
Title: currentTitle,
|
||
Table: &TableData{
|
||
Headers: currentHeaders,
|
||
Rows: currentRows,
|
||
},
|
||
})
|
||
currentHeaders = nil
|
||
currentRows = nil
|
||
currentTitle = ""
|
||
}
|
||
inTable = false
|
||
hasValidHeader = false
|
||
nonTableLineCount = 0
|
||
}
|
||
}
|
||
continue
|
||
}
|
||
|
||
// 检查是否是markdown表格行
|
||
if !strings.Contains(line, "|") {
|
||
// 如果已经在表格中,遇到非表格行则计数
|
||
if inTable {
|
||
nonTableLineCount++
|
||
// 如果连续非表格行过多,表格结束
|
||
if nonTableLineCount > maxNonTableLines {
|
||
// 当前表格结束,保存并重置
|
||
if len(currentHeaders) > 0 {
|
||
result = append(result, TableWithTitle{
|
||
Title: currentTitle,
|
||
Table: &TableData{
|
||
Headers: currentHeaders,
|
||
Rows: currentRows,
|
||
},
|
||
})
|
||
currentHeaders = nil
|
||
currentRows = nil
|
||
currentTitle = ""
|
||
}
|
||
inTable = false
|
||
hasValidHeader = false
|
||
nonTableLineCount = 0
|
||
}
|
||
}
|
||
continue
|
||
}
|
||
|
||
// 重置非表格行计数(遇到表格行了)
|
||
nonTableLineCount = 0
|
||
|
||
// 跳过分隔行
|
||
if r.isSeparatorLine(line) {
|
||
// 分隔行后应该开始数据行
|
||
if hasValidHeader {
|
||
continue
|
||
}
|
||
// 如果还没有表头,跳过分隔行
|
||
continue
|
||
}
|
||
|
||
// 解析表格行
|
||
cells := strings.Split(line, "|")
|
||
// 清理首尾空元素
|
||
if len(cells) > 0 && strings.TrimSpace(cells[0]) == "" {
|
||
cells = cells[1:]
|
||
}
|
||
if len(cells) > 0 && strings.TrimSpace(cells[len(cells)-1]) == "" {
|
||
cells = cells[:len(cells)-1]
|
||
}
|
||
|
||
// 清理每个单元格,过滤空字符
|
||
cleanedCells := make([]string, 0, len(cells))
|
||
for _, cell := range cells {
|
||
cleaned := strings.TrimSpace(cell)
|
||
// 移除HTML标签(如<br>)
|
||
cleaned = r.removeHTMLTags(cleaned)
|
||
cleanedCells = append(cleanedCells, cleaned)
|
||
}
|
||
|
||
// 检查这一行是否有有效内容
|
||
hasContent := false
|
||
for _, cell := range cleanedCells {
|
||
if strings.TrimSpace(cell) != "" {
|
||
hasContent = true
|
||
break
|
||
}
|
||
}
|
||
|
||
if !hasContent || len(cleanedCells) == 0 {
|
||
continue
|
||
}
|
||
|
||
if !inTable {
|
||
// 第一行作为表头
|
||
currentHeaders = cleanedCells
|
||
inTable = true
|
||
hasValidHeader = true
|
||
} else {
|
||
// 数据行,确保列数与表头一致
|
||
row := make([]string, len(currentHeaders))
|
||
for i := range row {
|
||
if i < len(cleanedCells) {
|
||
row[i] = cleanedCells[i]
|
||
} else {
|
||
row[i] = ""
|
||
}
|
||
}
|
||
// 检查数据行是否有有效内容(至少有一个非空单元格)
|
||
hasData := false
|
||
for _, cell := range row {
|
||
if strings.TrimSpace(cell) != "" {
|
||
hasData = true
|
||
break
|
||
}
|
||
}
|
||
// 只添加有有效内容的数据行
|
||
if hasData {
|
||
currentRows = append(currentRows, row)
|
||
}
|
||
}
|
||
}
|
||
|
||
// 处理最后一个表格
|
||
if len(currentHeaders) > 0 {
|
||
result = append(result, TableWithTitle{
|
||
Title: currentTitle,
|
||
Table: &TableData{
|
||
Headers: currentHeaders,
|
||
Rows: currentRows,
|
||
},
|
||
})
|
||
}
|
||
|
||
if len(result) == 0 {
|
||
return nil, fmt.Errorf("无法解析表格:未找到表头")
|
||
}
|
||
|
||
r.logger.Info("解析多个表格完成",
|
||
zap.Int("table_count", len(result)))
|
||
|
||
return result, nil
|
||
}
|
||
|
||
// parseMarkdownTable 解析markdown格式的表格(兼容方法,调用新方法)
|
||
func (r *DatabaseTableReader) parseMarkdownTable(content string) (*TableData, error) {
|
||
tablesWithTitles, err := r.parseMarkdownTablesWithTitles(content)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if len(tablesWithTitles) == 0 {
|
||
return nil, fmt.Errorf("未找到任何表格")
|
||
}
|
||
// 返回第一个表格(向后兼容)
|
||
return tablesWithTitles[0].Table, nil
|
||
}
|
||
|
||
// mergeTables 合并多个表格(使用最宽的表头)
|
||
func (r *DatabaseTableReader) mergeTables(existingHeaders []string, existingRows [][]string, newHeaders []string, newRows [][]string) ([]string, [][]string) {
|
||
// 如果这是第一个表格,直接返回
|
||
if len(existingHeaders) == 0 {
|
||
return newHeaders, newRows
|
||
}
|
||
|
||
// 使用最宽的表头(列数最多的)
|
||
var finalHeaders []string
|
||
if len(newHeaders) > len(existingHeaders) {
|
||
finalHeaders = make([]string, len(newHeaders))
|
||
copy(finalHeaders, newHeaders)
|
||
} else {
|
||
finalHeaders = make([]string, len(existingHeaders))
|
||
copy(finalHeaders, existingHeaders)
|
||
}
|
||
|
||
// 合并所有行,确保列数与最终表头一致
|
||
mergedRows := make([][]string, 0, len(existingRows)+len(newRows))
|
||
|
||
// 添加已有行
|
||
for _, row := range existingRows {
|
||
adjustedRow := make([]string, len(finalHeaders))
|
||
copy(adjustedRow, row)
|
||
mergedRows = append(mergedRows, adjustedRow)
|
||
}
|
||
|
||
// 添加新行
|
||
for _, row := range newRows {
|
||
adjustedRow := make([]string, len(finalHeaders))
|
||
for i := range adjustedRow {
|
||
if i < len(row) {
|
||
adjustedRow[i] = row[i]
|
||
} else {
|
||
adjustedRow[i] = ""
|
||
}
|
||
}
|
||
mergedRows = append(mergedRows, adjustedRow)
|
||
}
|
||
|
||
return finalHeaders, mergedRows
|
||
}
|
||
|
||
// removeHTMLTags 移除HTML标签(如<br>)和样式信息
|
||
func (r *DatabaseTableReader) removeHTMLTags(text string) string {
|
||
// 先移除所有HTML标签(包括带样式的标签,如 <span style="color:red">)
|
||
// 使用正则表达式移除所有HTML标签及其内容
|
||
re := regexp.MustCompile(`<[^>]+>`)
|
||
text = re.ReplaceAllString(text, "")
|
||
|
||
// 替换常见的HTML换行标签为空格
|
||
text = strings.ReplaceAll(text, "<br>", " ")
|
||
text = strings.ReplaceAll(text, "<br/>", " ")
|
||
text = strings.ReplaceAll(text, "<br />", " ")
|
||
text = strings.ReplaceAll(text, "\n", " ")
|
||
|
||
// 移除HTML实体
|
||
text = strings.ReplaceAll(text, " ", " ")
|
||
text = strings.ReplaceAll(text, "&", "&")
|
||
text = strings.ReplaceAll(text, "<", "<")
|
||
text = strings.ReplaceAll(text, ">", ">")
|
||
text = strings.ReplaceAll(text, """, "\"")
|
||
text = strings.ReplaceAll(text, "'", "'")
|
||
|
||
return strings.TrimSpace(text)
|
||
}
|
||
|
||
// isSeparatorLine 检查是否是markdown表格的分隔行
|
||
func (r *DatabaseTableReader) isSeparatorLine(line string) bool {
|
||
if !strings.Contains(line, "-") {
|
||
return false
|
||
}
|
||
for _, r := range line {
|
||
if r != '|' && r != '-' && r != ':' && r != ' ' {
|
||
return false
|
||
}
|
||
}
|
||
return true
|
||
}
|
||
|
||
// convertTableDataToJSON 将表格数据转换为JSON数组格式
|
||
func (r *DatabaseTableReader) convertTableDataToJSON(tableData *TableData) []map[string]interface{} {
|
||
if tableData == nil || len(tableData.Headers) == 0 {
|
||
r.logger.Warn("表格数据为空,无法转换为JSON")
|
||
return []map[string]interface{}{}
|
||
}
|
||
|
||
jsonArray := make([]map[string]interface{}, 0, len(tableData.Rows))
|
||
validRowCount := 0
|
||
|
||
for rowIndex, row := range tableData.Rows {
|
||
rowObj := make(map[string]interface{})
|
||
for i, header := range tableData.Headers {
|
||
// 获取对应的单元格值
|
||
var cellValue string
|
||
if i < len(row) {
|
||
cellValue = strings.TrimSpace(row[i])
|
||
}
|
||
// 将表头作为key,单元格值作为value
|
||
header = strings.TrimSpace(header)
|
||
if header != "" {
|
||
rowObj[header] = cellValue
|
||
}
|
||
}
|
||
// 只添加有有效数据的行
|
||
if len(rowObj) > 0 {
|
||
jsonArray = append(jsonArray, rowObj)
|
||
validRowCount++
|
||
} else {
|
||
r.logger.Debug("跳过空行",
|
||
zap.Int("row_index", rowIndex))
|
||
}
|
||
}
|
||
|
||
r.logger.Debug("表格转JSON完成",
|
||
zap.Int("total_rows", len(tableData.Rows)),
|
||
zap.Int("valid_rows", validRowCount),
|
||
zap.Int("json_array_length", len(jsonArray)))
|
||
|
||
return jsonArray
|
||
}
|
||
|
||
// getContentPreview 获取内容预览(用于日志记录)
|
||
func (r *DatabaseTableReader) getContentPreview(content string, maxLen int) string {
|
||
content = strings.TrimSpace(content)
|
||
if len(content) <= maxLen {
|
||
return content
|
||
}
|
||
return content[:maxLen] + "..."
|
||
}
|
||
|
||
// formatValue 格式化值为字符串
|
||
func (r *DatabaseTableReader) formatValue(value interface{}) string {
|
||
if value == nil {
|
||
return ""
|
||
}
|
||
|
||
var result string
|
||
switch v := value.(type) {
|
||
case string:
|
||
result = strings.TrimSpace(v)
|
||
// 如果去除空白后为空,返回空字符串
|
||
if result == "" {
|
||
return ""
|
||
}
|
||
// 移除HTML标签和样式,确保数据干净
|
||
result = r.removeHTMLTags(result)
|
||
return result
|
||
case bool:
|
||
if v {
|
||
return "是"
|
||
}
|
||
return "否"
|
||
case float64:
|
||
if v == float64(int64(v)) {
|
||
return fmt.Sprintf("%.0f", v)
|
||
}
|
||
return fmt.Sprintf("%g", v)
|
||
case int, int8, int16, int32, int64:
|
||
return fmt.Sprintf("%d", v)
|
||
case uint, uint8, uint16, uint32, uint64:
|
||
return fmt.Sprintf("%d", v)
|
||
default:
|
||
result = fmt.Sprintf("%v", v)
|
||
// 去除空白字符
|
||
result = strings.TrimSpace(result)
|
||
if result == "" {
|
||
return ""
|
||
}
|
||
return result
|
||
}
|
||
}
|