完善skills；测试用例生成页面功能初步实现

2026-05-05 19:45:33 +08:00
parent 0c2ed67e2a
commit 69b49d28b2
35 changed files with 4396 additions and 658 deletions
--- a/rag-web-ui/backend/app/tools/srs_reqs_qwen/src/document_parser.py
+++ b/rag-web-ui/backend/app/tools/srs_reqs_qwen/src/document_parser.py
@@ -137,6 +137,19 @@ class DocumentParser(ABC):
        chinese_numbers = '一二三四五六七八九十百千万'
        return text and all(c in chinese_numbers for c in text)

+    def _section_sort_key(self, section: 'Section') -> Tuple[int, List[int], str]:
+        """Sort key: dotted-numeric numbers first (as int parts), others by level; title breaks ties."""
+        label, heading = (section.number or "").strip(), section.title or ""
+        numeric = bool(label) and re.match(r'^\d+(?:\.\d+)*$', label) is not None
+        return (0, [int(p) for p in label.split('.')], heading) if numeric else (1, [section.level], heading)
+
+    def _sort_sections_by_number(self, sections: List['Section']) -> List['Section']:
+        """Return a new list ordered by _section_sort_key; each non-empty children list is re-sorted recursively."""
+        ranked = sorted(sections, key=self._section_sort_key)
+        for node in filter(lambda item: item.children, ranked):
+            node.children = self._sort_sections_by_number(node.children)
+        return ranked
+

 class DocxParser(DocumentParser):
    """DOCX格式文档解析器"""
@@ -210,6 +223,7 @@ class DocxParser(DocumentParser):
            
            # 为没有编号的章节自动生成编号
            self._auto_number_sections(self.sections)
+            self.sections = self._sort_sections_by_number(self.sections)
            
            logger.info(f"完成Docx解析，提取{len(self.sections)}个顶级章节")
            return self.sections
@@ -236,12 +250,17 @@ class DocxParser(DocumentParser):
        """解析标题，返回(编号, 标题, 级别)"""
        style_name = paragraph.style.name if paragraph.style else ""
        is_heading_style = style_name.lower().startswith('heading') if style_name else False
+
+        if self._is_calendar_line(text):
+            return None
        
        # 数字编号标题
-        match = re.match(r'^(\d+(?:\.\d+)*)\s*[\.、]?\s*(.+)$', text)
+        match = re.match(r'^(\d+(?:\.\d+)*)\s*[\.、．)）:：\-_/]?\s*(.+)$', text)
        if match and self._is_valid_heading(match.group(2)):
            number = match.group(1)
            title = match.group(2).strip()
+            if not self._is_valid_numbered_heading(number, title):
+                return None
            level = len(number.split('.'))
            return number, title, level
        
@@ -263,6 +282,31 @@ class DocxParser(DocumentParser):
        
        return None

+    def _is_calendar_line(self, text: str) -> bool:
+        """Return True for a bare date line such as "2026年5月" or "2026 年 5 月 5 日"; all whitespace is ignored."""
+        return bool(re.match(r'^\d{4}年\d{1,2}月(?:\d{1,2}日)?$', re.sub(r'\s+', '', text or "")))
+
+    def _is_valid_numbered_heading(self, number: str, title: str) -> bool:
+        """Return True when a dotted number plus title passes the heading sanity checks.
+
+        Rejected: more than 6 dotted parts, a first part outside 1..30, any
+        later part above 30, a single-part number whose title starts with
+        "年<1-2 digits>月", and any title whose first character is a digit.
+        """
+        pieces = number.split('.')
+        if len(pieces) > 6:
+            return False
+
+        # Range limits: first part must be 1..30, every later part at most 30.
+        if not 1 <= int(pieces[0]) <= 30 or any(int(piece) > 30 for piece in pieces[1:]):
+            return False
+
+        # A lone number followed by "年<month>月" reads as a date, not a heading.
+        if len(pieces) == 1 and re.match(r'^年\d{1,2}月', title):
+            return False
+
+        return not (title and title[0].isdigit())
+
    def _iter_block_items(self, parent):
        """按文档顺序迭代段落和表格"""
        from docx.text.paragraph import Paragraph
@@ -356,6 +400,7 @@ class PDFParser(DocumentParser):
            
            # 6. 为没有编号的章节自动生成编号
            self._auto_number_sections(self.sections)
+            self.sections = self._sort_sections_by_number(self.sections)
            
            logger.info(f"完成PDF解析，提取{len(self.sections)}个顶级章节")
            return self.sections
@@ -599,18 +644,6 @@ class PDFParser(DocumentParser):
                level = len(number.split('.'))
                top_level_number = int(number.split('.')[0])

-                # 顶级章节序号大幅跳跃通常是误识别（如正文中的“8 表...”）。
-                if level == 1 and last_top_level_number and top_level_number > last_top_level_number + 1:
-                    if line and not self._is_noise(line):
-                        content_buffer.append(line)
-                    continue
-
-                # 顶级章节编号倒退通常是正文枚举项被误识别（如“1 综合监控...”）。
-                if level == 1 and last_top_level_number and top_level_number < last_top_level_number:
-                    if line and not self._is_noise(line):
-                        content_buffer.append(line)
-                    continue
-
                if level > 6:
                    continue
                
@@ -645,10 +678,6 @@ class PDFParser(DocumentParser):
                for l in list(section_stack.keys()):
                    if l > level:
                        del section_stack[l]
-
-                # 若出现层级跳跃（如1->3），自动回退到父级+1。
-                if level > 1 and (level - 1) not in section_stack:
-                    section.level = max(section_stack.keys()) if section_stack else 1
                
                current_section = section
            else:
@@ -670,7 +699,10 @@ class PDFParser(DocumentParser):
            (章节编号, 章节标题) 或 None
        """
        # 模式: "3.1 功能需求" / "3.1.2 电场..."
-        match = re.match(r'^(\d+(?:\.\d+)*)[\s、.)）]*(.+)$', line)
+        if self._is_calendar_line(line):
+            return None
+
+        match = re.match(r'^(\d+(?:\.\d+)*)[\s、.．)）:：\-_/]*(.+)$', line)
        if not match:
            return None
        
@@ -692,7 +724,7 @@ class PDFParser(DocumentParser):
        
        # 检查子部分是否合理
        for part in parts[1:]:
-            if int(part) > 20:
+            if int(part) > 30:
                return None
        
        # 避免重复
@@ -747,6 +779,10 @@ class PDFParser(DocumentParser):
        
        return (number, title)

+    def _is_calendar_line(self, text: str) -> bool:
+        """Return True for a bare date line such as "2026年5月" or "2026 年 5 月 5 日"; all whitespace is ignored."""
+        return bool(re.match(r'^\d{4}年\d{1,2}月(?:\d{1,2}日)?$', re.sub(r'\s+', '', text or "")))
+
    def _looks_like_statement(self, title: str) -> bool:
        """判断标题是否更像正文语句而非章节名。"""
        if not title: