完善skills;测试用例生成页面功能初步实现
This commit is contained in:
@@ -137,6 +137,19 @@ class DocumentParser(ABC):
|
||||
chinese_numbers = '一二三四五六七八九十百千万'
|
||||
return text and all(c in chinese_numbers for c in text)
|
||||
|
||||
def _section_sort_key(self, section: 'Section') -> Tuple[int, List[int], str]:
    """Build the ordering key for one section.

    Args:
        section: Object exposing ``number``, ``title`` and ``level``.

    Returns:
        A ``(group, numeric_parts, title)`` tuple. A section whose stripped
        ``number`` is purely dotted-decimal (e.g. ``"3.1.2"``) gets group 0 and
        its integer components, so ``"2"`` orders before ``"10"``. Any other
        section gets group 1 and ``[section.level]``. A missing title is
        replaced by the empty string.
    """
    dotted = (section.number or "").strip()
    is_dotted_decimal = bool(dotted) and re.match(r'^\d+(?:\.\d+)*$', dotted) is not None

    if not is_dotted_decimal:
        # Unnumbered or non-numeric sections are grouped after the numbered ones.
        return (1, [section.level], section.title or "")

    components = list(map(int, dotted.split('.')))
    return (0, components, section.title or "")
|
||||
|
||||
def _sort_sections_by_number(self, sections: List['Section']) -> List['Section']:
    """Return the sections sorted by ``_section_sort_key``, recursing into children.

    Args:
        sections: Sibling sections to order. The list itself is not reordered.

    Returns:
        A new sorted list. Each section with a non-empty ``children`` list has
        that attribute replaced by its own sorted copy.
    """
    result = list(sections)
    result.sort(key=self._section_sort_key)

    for node in result:
        kids = node.children
        if not kids:
            continue
        node.children = self._sort_sections_by_number(kids)

    return result
|
||||
|
||||
|
||||
class DocxParser(DocumentParser):
|
||||
"""DOCX格式文档解析器"""
|
||||
@@ -210,6 +223,7 @@ class DocxParser(DocumentParser):
|
||||
|
||||
# 为没有编号的章节自动生成编号
|
||||
self._auto_number_sections(self.sections)
|
||||
self.sections = self._sort_sections_by_number(self.sections)
|
||||
|
||||
logger.info(f"完成Docx解析,提取{len(self.sections)}个顶级章节")
|
||||
return self.sections
|
||||
@@ -236,12 +250,17 @@ class DocxParser(DocumentParser):
|
||||
"""解析标题,返回(编号, 标题, 级别)"""
|
||||
style_name = paragraph.style.name if paragraph.style else ""
|
||||
is_heading_style = style_name.lower().startswith('heading') if style_name else False
|
||||
|
||||
if self._is_calendar_line(text):
|
||||
return None
|
||||
|
||||
# 数字编号标题
|
||||
match = re.match(r'^(\d+(?:\.\d+)*)\s*[\.、]?\s*(.+)$', text)
|
||||
match = re.match(r'^(\d+(?:\.\d+)*)\s*[\.、.))::\-_/]?\s*(.+)$', text)
|
||||
if match and self._is_valid_heading(match.group(2)):
|
||||
number = match.group(1)
|
||||
title = match.group(2).strip()
|
||||
if not self._is_valid_numbered_heading(number, title):
|
||||
return None
|
||||
level = len(number.split('.'))
|
||||
return number, title, level
|
||||
|
||||
@@ -263,6 +282,31 @@ class DocxParser(DocumentParser):
|
||||
|
||||
return None
|
||||
|
||||
def _is_calendar_line(self, text: str) -> bool:
    """Return True when ``text`` consists solely of a Chinese calendar date.

    Recognised shapes are ``YYYY年M月`` and ``YYYY年M月D日`` (one or two digits
    for month and day), e.g. ``"2024年3月"`` or ``"2024 年 3 月 15 日"``.

    Args:
        text: Candidate line; ``None`` or empty yields False.

    Returns:
        True if, after removing all whitespace, the whole line is such a date.
    """
    # str.split() with no argument splits on every Unicode whitespace character,
    # so tabs, NBSP and full-width spaces (U+3000) are removed along with ASCII
    # spaces; replace(" ", "") alone left those in place and the match failed.
    compact = "".join((text or "").split())
    return re.match(r'^\d{4}年\d{1,2}月(?:\d{1,2}日)?$', compact) is not None
|
||||
|
||||
def _is_valid_numbered_heading(self, number: str, title: str) -> bool:
    """Decide whether a ``number``/``title`` pair is acceptable as a numbered heading.

    A pair is rejected when any of the following holds:
      * the number has more than 6 dot-separated components;
      * a component is not an integer (e.g. ``""``, ``"1."``, ``"1..2"``);
      * the first component is outside 1..30, or a later component exceeds 30;
      * the number has a single component and the title starts with
        ``年<1-2 digits>月`` (the line is a split date such as ``2024`` + ``年3月``);
      * the title starts with a digit.

    Args:
        number: Dotted section number such as ``"3.1.2"``.
        title: Heading text that follows the number.

    Returns:
        True if the pair passes every check, otherwise False.
    """
    max_depth = 6
    max_component = 30

    title = title or ""
    parts = (number or "").split('.')
    if len(parts) > max_depth:
        return False

    try:
        values = [int(part) for part in parts]
    except ValueError:
        # A malformed number is simply not a valid heading; do not propagate
        # the conversion error to the caller.
        return False

    if values[0] < 1 or values[0] > max_component:
        return False

    if any(value > max_component for value in values[1:]):
        return False

    if len(values) == 1 and re.match(r'^年\d{1,2}月', title):
        return False

    if title and title[0].isdigit():
        return False

    return True
|
||||
|
||||
def _iter_block_items(self, parent):
|
||||
"""按文档顺序迭代段落和表格"""
|
||||
from docx.text.paragraph import Paragraph
|
||||
@@ -356,6 +400,7 @@ class PDFParser(DocumentParser):
|
||||
|
||||
# 6. 为没有编号的章节自动生成编号
|
||||
self._auto_number_sections(self.sections)
|
||||
self.sections = self._sort_sections_by_number(self.sections)
|
||||
|
||||
logger.info(f"完成PDF解析,提取{len(self.sections)}个顶级章节")
|
||||
return self.sections
|
||||
@@ -599,18 +644,6 @@ class PDFParser(DocumentParser):
|
||||
level = len(number.split('.'))
|
||||
top_level_number = int(number.split('.')[0])
|
||||
|
||||
# 顶级章节序号大幅跳跃通常是误识别(如正文中的“8 表...”)。
|
||||
if level == 1 and last_top_level_number and top_level_number > last_top_level_number + 1:
|
||||
if line and not self._is_noise(line):
|
||||
content_buffer.append(line)
|
||||
continue
|
||||
|
||||
# 顶级章节编号倒退通常是正文枚举项被误识别(如“1 综合监控...”)。
|
||||
if level == 1 and last_top_level_number and top_level_number < last_top_level_number:
|
||||
if line and not self._is_noise(line):
|
||||
content_buffer.append(line)
|
||||
continue
|
||||
|
||||
if level > 6:
|
||||
continue
|
||||
|
||||
@@ -645,10 +678,6 @@ class PDFParser(DocumentParser):
|
||||
for l in list(section_stack.keys()):
|
||||
if l > level:
|
||||
del section_stack[l]
|
||||
|
||||
# 若出现层级跳跃(如1->3),自动回退到父级+1。
|
||||
if level > 1 and (level - 1) not in section_stack:
|
||||
section.level = max(section_stack.keys()) if section_stack else 1
|
||||
|
||||
current_section = section
|
||||
else:
|
||||
@@ -670,7 +699,10 @@ class PDFParser(DocumentParser):
|
||||
(章节编号, 章节标题) 或 None
|
||||
"""
|
||||
# 模式: "3.1 功能需求" / "3.1.2 电场..."
|
||||
match = re.match(r'^(\d+(?:\.\d+)*)[\s、.))]*(.+)$', line)
|
||||
if self._is_calendar_line(line):
|
||||
return None
|
||||
|
||||
match = re.match(r'^(\d+(?:\.\d+)*)[\s、..))::\-_/]*(.+)$', line)
|
||||
if not match:
|
||||
return None
|
||||
|
||||
@@ -692,7 +724,7 @@ class PDFParser(DocumentParser):
|
||||
|
||||
# 检查子部分是否合理
|
||||
for part in parts[1:]:
|
||||
if int(part) > 20:
|
||||
if int(part) > 30:
|
||||
return None
|
||||
|
||||
# 避免重复
|
||||
@@ -747,6 +779,10 @@ class PDFParser(DocumentParser):
|
||||
|
||||
return (number, title)
|
||||
|
||||
def _is_calendar_line(self, text: str) -> bool:
    """Return True when ``text`` consists solely of a Chinese calendar date.

    Recognised shapes are ``YYYY年M月`` and ``YYYY年M月D日`` (one or two digits
    for month and day), e.g. ``"2024年3月"`` or ``"2024 年 3 月 15 日"``.

    Args:
        text: Candidate line; ``None`` or empty yields False.

    Returns:
        True if, after removing all whitespace, the whole line is such a date.
    """
    # str.split() with no argument splits on every Unicode whitespace character,
    # so tabs, NBSP and full-width spaces (U+3000) are removed along with ASCII
    # spaces; replace(" ", "") alone left those in place and the match failed.
    compact = "".join((text or "").split())
    return re.match(r'^\d{4}年\d{1,2}月(?:\d{1,2}日)?$', compact) is not None
|
||||
|
||||
def _looks_like_statement(self, title: str) -> bool:
|
||||
"""判断标题是否更像正文语句而非章节名。"""
|
||||
if not title:
|
||||
|
||||
@@ -51,6 +51,8 @@ class SRSTool:
|
||||
"other": "低",
|
||||
}
|
||||
|
||||
UNKNOWN_INTERFACE_VALUES = {"", "未知", "unknown", "n/a", "-", "--", "无", "none", "null"}
|
||||
|
||||
def __init__(self) -> None:
|
||||
ToolRegistry.register(self.DEFINITION)
|
||||
|
||||
@@ -90,24 +92,78 @@ class SRSTool:
|
||||
normalized: List[Dict[str, Any]] = []
|
||||
for index, req in enumerate(extracted, start=1):
|
||||
description = (req.description or "").strip()
|
||||
title = description[:40] if description else f"需求项 {index}"
|
||||
title = self._build_short_title(description, index)
|
||||
requirement_type = self._normalize_requirement_type(
|
||||
req_type=getattr(req, "type", "functional"),
|
||||
interface_name=getattr(req, "interface_name", ""),
|
||||
interface_type=getattr(req, "interface_type", ""),
|
||||
data_source=getattr(req, "source", ""),
|
||||
data_destination=getattr(req, "destination", ""),
|
||||
)
|
||||
source_field = f"{req.section_number} {req.section_title}".strip() or "文档解析"
|
||||
normalized.append(
|
||||
{
|
||||
"id": req.id,
|
||||
"title": title,
|
||||
"description": description,
|
||||
"priority": self.PRIORITY_BY_TYPE.get(req.type, "中"),
|
||||
"priority": "中",
|
||||
"acceptance_criteria": [description] if description else ["待补充验收标准"],
|
||||
"source_field": source_field,
|
||||
"section_uid": req.section_uid,
|
||||
"section_number": req.section_number,
|
||||
"section_title": req.section_title,
|
||||
"requirement_type": req.type,
|
||||
"requirement_type": requirement_type,
|
||||
"interface_name": req.interface_name if requirement_type == "interface" else "",
|
||||
"interface_type": req.interface_type if requirement_type == "interface" else "",
|
||||
"data_source": req.source if requirement_type == "interface" else "",
|
||||
"data_destination": req.destination if requirement_type == "interface" else "",
|
||||
"sort_order": index,
|
||||
}
|
||||
)
|
||||
return normalized
|
||||
|
||||
def _normalize_requirement_type(
    self,
    req_type: Any,
    interface_name: Any,
    interface_type: Any,
    data_source: Any,
    data_destination: Any,
) -> str:
    """Map a raw requirement type onto one of the keys of ``PRIORITY_BY_TYPE``.

    Args:
        req_type: Raw type label; Chinese labels such as ``"接口需求"`` are
            translated, anything not found in ``PRIORITY_BY_TYPE`` becomes
            ``"functional"``.
        interface_name: Interface name extracted for the requirement.
        interface_type: Interface type extracted for the requirement.
        data_source: Data source extracted for the requirement.
        data_destination: Data destination extracted for the requirement.

    Returns:
        ``"interface"`` when the type is interface or when any of the four
        interface fields holds a value outside ``UNKNOWN_INTERFACE_VALUES``
        (compared stripped and lower-cased); otherwise the normalized type.
    """
    chinese_to_canonical = {
        "功能需求": "functional",
        "接口需求": "interface",
        "性能需求": "performance",
        "安全需求": "security",
        "可靠性需求": "reliability",
        "其他需求": "other",
    }

    label = str(req_type or "").strip()
    canonical = chinese_to_canonical.get(label, label)
    if canonical not in self.PRIORITY_BY_TYPE:
        canonical = "functional"

    # A single meaningful interface attribute is enough to reclassify the item.
    mentions_interface = False
    for candidate in (interface_name, interface_type, data_source, data_destination):
        cleaned = str(candidate or "").strip().lower()
        if cleaned not in self.UNKNOWN_INTERFACE_VALUES:
            mentions_interface = True
            break

    return "interface" if mentions_interface else canonical
|
||||
|
||||
def _build_short_title(self, description: str, index: int) -> str:
    """Derive a short display title (at most 20 characters) from a description.

    The title is the text before the first separator found, trying the
    separators in priority order: ``。``, ``;``, newline, full-width ``;``, ``.``.
    A separator that would leave an empty title (the text starts with it, as
    in ``".NET ..."``) is skipped instead of producing a blank title.

    Args:
        description: Requirement description; may be ``None`` or empty.
        index: 1-based position used for the placeholder title.

    Returns:
        The shortened title, or ``"需求项 <index>"`` when the description has
        no letters, digits or CJK characters to build a title from.
    """
    max_length = 20
    fallback = f"需求项 {index}"

    text = (description or "").strip()
    if not any(ch.isalnum() for ch in text):
        # Empty or punctuation-only descriptions cannot yield a usable title.
        return fallback

    # "\uff1b" is the full-width semicolon; the original tuple listed the ASCII
    # ";" twice, which made the second entry dead.
    # NOTE(review): "." also matches decimal points (e.g. "1.5秒"); that
    # behavior is left unchanged here.
    for separator in ("。", ";", "\n", "\uff1b", "."):
        head, found, _ = text.partition(separator)
        head = head.strip()
        if found and head:
            text = head
            break

    # text is already stripped, so slicing is a no-op for short titles.
    return text[:max_length].rstrip()
|
||||
|
||||
def _load_config(self) -> Dict[str, Any]:
|
||||
config_path = Path(__file__).with_name("default_config.yaml")
|
||||
if config_path.exists():
|
||||
|
||||
Reference in New Issue
Block a user