2026-02-03 22:48:22 +08:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
"""
|
|
|
|
|
|
SRS 解析工具 - 主程序入口
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import logging
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
# Prepend the directory that contains this file to sys.path
# (presumably so the ``src`` package imported below is found - TODO confirm)
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
|
|
|
|
|
|
|
|
from src.utils import load_config, setup_logging, validate_file_path, ensure_directory_exists, get_env_or_config
|
|
|
|
|
|
from src.document_parser import create_parser
|
2026-04-18 20:33:58 +08:00
|
|
|
|
from src.document_parser import Section
|
2026-02-03 22:48:22 +08:00
|
|
|
|
from src.requirement_extractor import RequirementExtractor
|
|
|
|
|
|
from src.json_generator import JSONGenerator
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger shared by every function in this file
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_llm(config: dict):
    """
    Create the LLM client described by the ``llm`` section of the config.

    Args:
        config: Application configuration dictionary. Only its ``llm``
            sub-dictionary is read (keys ``enabled``, ``provider``,
            ``api_key``, ``model``, ``temperature``, ``max_tokens``).

    Returns:
        A ``QwenLLM`` instance. This function never returns ``None``;
        every failure is reported by raising.

    Raises:
        ValueError: If ``llm.enabled`` is falsy, or if no API key is found
            in the ``DASHSCOPE_API_KEY`` environment variable or in
            ``llm.api_key``.
        RuntimeError: If the LLM module cannot be imported or the client
            cannot be constructed (the original exception is chained).
    """
    # A key that is present but empty (e.g. a bare ``llm:`` in YAML) comes
    # back as None rather than the default, so fall back to {} explicitly.
    llm_config = config.get('llm') or {}

    # The current version only supports LLM mode
    if not llm_config.get('enabled', True):
        raise ValueError("当前版本仅支持LLM模式,请将配置 llm.enabled 设为 true")

    # NOTE(review): provider is only used in the log message below; a QwenLLM
    # is built regardless of its value - confirm this is intended.
    provider = llm_config.get('provider', 'qwen')

    # Resolve the API key via get_env_or_config (environment variable name
    # first, configured value second)
    api_key = get_env_or_config('DASHSCOPE_API_KEY', llm_config.get('api_key'))

    if not api_key:
        raise ValueError("未配置API密钥:请设置环境变量 DASHSCOPE_API_KEY 或在 config.yaml 中配置 llm.api_key")

    try:
        # Imported lazily so a missing optional dependency surfaces here as a
        # RuntimeError instead of breaking module import
        from src.llm_interface import QwenLLM

        model = llm_config.get('model', 'qwen-plus')
        temperature = llm_config.get('temperature', 0.3)
        max_tokens = llm_config.get('max_tokens', 1024)

        llm = QwenLLM(
            api_key=api_key,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
        )

        logger.info(f"成功创建LLM实例: {provider} ({model})")
        return llm

    except ImportError as e:
        raise RuntimeError(f"无法导入LLM模块: {e}。请安装依赖:pip install dashscope") from e
    except Exception as e:
        raise RuntimeError(f"创建LLM实例失败: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_chapter_selector(selector: str) -> list:
    """Parse a comma-separated chapter selector into a list of numbers.

    Args:
        selector: Text such as ``"3"`` or ``"3,4.1"``. Whitespace around
            each item is ignored and empty items are skipped.

    Returns:
        The chapter numbers as strings, in input order (duplicates kept).
        An empty or missing selector yields an empty list.

    Raises:
        ValueError: If an item is not made of dot-separated digit groups.
    """
    if not selector:
        return []

    parsed = []
    for raw_item in selector.split(','):
        item = raw_item.strip()
        if not item:
            # Tolerate stray commas such as "3,,4" or a trailing ","
            continue
        if any(not part.isdigit() for part in item.split('.')):
            raise ValueError(f"无效章节编号: {item},仅支持如 3 或 3.1 的格式")
        parsed.append(item)
    return parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _clone_section_with_children(section: Section) -> Section:
    """Return a copy of *section* together with its whole subtree.

    The level, title, number, content and uid are carried over, the
    ``tables`` and ``blocks`` lists are copied shallowly, and every child
    is cloned recursively and attached through ``add_child``.
    """
    carried_over = {
        name: getattr(section, name)
        for name in ('level', 'title', 'number', 'content', 'uid')
    }
    clone = Section(**carried_over)

    # New list objects, same elements: the clone's lists can be changed
    # without affecting the source section's lists
    clone.tables = [*section.tables]
    clone.blocks = [*section.blocks]

    for sub_section in section.children:
        clone.add_child(_clone_section_with_children(sub_section))

    return clone
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def filter_sections_by_chapters(sections: list, chapters: list) -> list:
    """Filter the section tree by chapter-number prefix.

    A section is selected when its number equals one of *chapters* or
    starts with ``"<chapter>."`` (so ``3`` selects ``3`` and every ``3.x``).
    Selected sections are cloned with their full subtree. A section that is
    not selected is kept only as a container for selected descendants.

    Args:
        sections: Top-level sections of the parsed document.
        chapters: Chapter numbers to keep; when empty, *sections* is
            returned unchanged (the same list object).

    Returns:
        A new list of copied sections, or *sections* itself when no
        chapters were requested.
    """
    if not chapters:
        return sections

    def is_selected(number: str) -> bool:
        """Tell whether a section number falls under a requested chapter."""
        cleaned = (number or "").strip()
        if not cleaned:
            return False
        return any(
            cleaned == wanted or cleaned.startswith(f"{wanted}.")
            for wanted in chapters
        )

    def prune(node: Section):
        """Return the filtered copy of *node*, or None if nothing survives."""
        if is_selected(node.number):
            return _clone_section_with_children(node)

        # Not selected itself: build a copy without children and attach
        # only the children that survive filtering
        shell = Section(
            level=node.level,
            title=node.title,
            number=node.number,
            content=node.content,
            uid=node.uid,
        )
        shell.tables = list(node.tables)
        shell.blocks = list(node.blocks)

        for sub_node in node.children:
            survivor = prune(sub_node)
            if survivor:
                shell.add_child(survivor)

        if shell.children:
            return shell
        return None

    return [kept for kept in (prune(top) for top in sections) if kept]
|
2026-02-03 22:48:22 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the tool."""
    parser = argparse.ArgumentParser(
        description='SRS需求文档解析工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n示例用法:\n"
            "python main.py --input sample.pdf --output output.json\n"
            "python main.py -i requirements.docx -o output.json --verbose\n"
            "python main.py -i DC-SRS.pdf -o output.json\n"
        ),
    )
    parser.add_argument(
        '--input', '-i',
        type=str,
        required=True,
        help='输入的SRS文档路径(支持.docx和.pdf)',
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        default='output.json',
        help='输出JSON文件路径(默认:output.json)',
    )
    parser.add_argument(
        '--config', '-c',
        type=str,
        default=None,
        help='配置文件路径(默认:./config.yaml)',
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='输出详细日志',
    )
    parser.add_argument(
        '--chapters',
        type=str,
        default=None,
        help='按章节提取(如: 3 或 3,4.1);输入3表示提取第3章及其子章节',
    )
    return parser


def _log_banner(title: str, leading_newline: bool = True) -> None:
    """Log *title* framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    logger.info("\n" + rule if leading_newline else rule)
    logger.info(title)
    logger.info(rule)


def _log_section_tree(sections, indent=0) -> None:
    """Log the number and title of every section, indented by depth."""
    for section in sections:
        logger.info(" " * indent + f"- {section.number} {section.title}")
        if section.children:
            _log_section_tree(section.children, indent + 1)


def main():
    """Program entry point: parse a document, extract requirements, write JSON.

    Steps: read the command line and configuration, validate the input
    path and the ``--chapters`` selector, create the LLM client, parse the
    document, optionally filter by chapter, extract requirements, then
    generate and save the JSON output.

    Returns:
        True on success; False when validation fails or any exception is
        raised inside the processing pipeline (the error is logged with
        its traceback).
    """
    args = _build_arg_parser().parse_args()

    # Load configuration
    config = load_config(args.config)

    # --verbose overrides the configured log level
    if args.verbose:
        config.setdefault('logging', {})['level'] = 'DEBUG'
    setup_logging(config)

    _log_banner("SRS需求文档解析工具启动(LLM增强版)", leading_newline=False)

    try:
        # Validate the input file
        if not validate_file_path(args.input, ['.pdf', '.docx']):
            logger.error(f"输入文件验证失败: {args.input}")
            return False

        logger.info(f"输入文件: {args.input}")

        # Validate the chapter selector before creating the LLM client or
        # parsing the document, so a malformed value fails immediately
        # instead of after that work has been done.
        selected_chapters = parse_chapter_selector(args.chapters) if args.chapters else []

        # Create the output directory if the output path names one
        output_dir = os.path.dirname(args.output) or '.'
        if output_dir != '.' and not ensure_directory_exists(output_dir):
            logger.error(f"无法创建输出目录: {output_dir}")
            return False

        logger.info(f"输出文件: {args.output}")

        # Create the LLM instance (required; create_llm raises on failure)
        llm = create_llm(config)
        logger.info("LLM增强模式已启用")

        # Step 1: parse the document
        _log_banner("步骤1:解析文档")

        doc_parser = create_parser(args.input)
        if llm:
            doc_parser.set_llm(llm)

        sections = doc_parser.parse()
        document_title = doc_parser.get_document_title()

        if selected_chapters:
            sections = filter_sections_by_chapters(sections, selected_chapters)
            if not sections:
                raise ValueError(f"未匹配到指定章节: {', '.join(selected_chapters)}")
            logger.info(f"章节筛选已启用: {', '.join(selected_chapters)}")

        logger.info(f"成功解析文档,提取{len(sections)}个顶级章节")

        # Print the section structure
        if args.verbose:
            logger.info("章节结构:")
            _log_section_tree(sections)

        # Step 2: extract requirements
        _log_banner("步骤2:提取需求(LLM增强模式)")

        extractor = RequirementExtractor(config, llm=llm)
        requirements = extractor.extract_from_sections(sections)

        # Requirement statistics
        stats = extractor.get_statistics()
        logger.info("\n需求统计:")
        for req_type, count in stats['by_type'].items():
            logger.info(f" {req_type}: {count}项")
        logger.info(f" 总计: {stats['total']}项")

        # Step 3: generate JSON
        _log_banner("步骤3:生成JSON")

        generator = JSONGenerator(config)
        json_output = generator.generate(
            sections,
            requirements,
            document_title,
        )

        logger.info("JSON结构生成完成")

        # Step 4: save the result
        _log_banner("步骤4:保存结果")

        generator.save_to_file(json_output, args.output)
        logger.info(f"成功保存JSON文件到: {args.output}")

        # Report the size of the written file
        if os.path.exists(args.output):
            file_size = os.path.getsize(args.output)
            logger.info(f"文件大小: {file_size} 字节")

        _log_banner("SRS需求文档解析完成!")

        return True

    except Exception as e:
        # Top-level boundary: log with traceback and signal failure to the
        # caller, which maps the result to the process exit code
        logger.error(f"处理过程中出现错误: {e}", exc_info=True)
        return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Map main()'s boolean result to the process exit status:
    # truthy -> 0 (success), falsy -> 1 (failure)
    sys.exit(0 if main() else 1)
|