RAG-TEST-TOOLS/code_parser.py

import os
import re
from pathlib import Path
try:
    import tree_sitter
    import tree_sitter_c
    import tree_sitter_cpp
    from tree_sitter import Language, Parser
except ImportError:  # pragma: no cover - depends on deployment environment
    tree_sitter = None
    tree_sitter_c = None
    tree_sitter_cpp = None
    Language = None
    Parser = None

# 常量定义
CPP_EXTENSIONS = {'.c', '.cpp', '.h', '.hpp', '.tcc'}
IGNORE_DIRS = {'build', 'cmake-build', '.git', 'vendor', 'lib', 'external', 'Debug'}
CPP_LANGUAGE = Language(tree_sitter_cpp.language()) if Language and tree_sitter_cpp else None

class RegexNode:
    def __init__(self, node_type, text="", start_line=0, end_line=0, fields=None, children=None):
        self.type = node_type
        self.text = text.encode("utf-8", errors="ignore") if isinstance(text, str) else text
        self.start_point = (start_line, 0)
        self.end_point = (end_line, 0)
        self._fields = fields or {}
        self.children = children or []
        self.named_children = self.children

    def child_by_field_name(self, name):
        return self._fields.get(name)


class RegexTree:
    def __init__(self, root_node):
        self.root_node = root_node


class RegexParser:
    _signature_re = re.compile(
        r"(?m)^[ \t]*(?:[A-Za-z_][\w\s\*\&:<>]*?[ \t]+)?([A-Za-z_]\w*(?:::[A-Za-z_]\w*)?)"
        r"\s*\([^;{}]*\)\s*\{"
    )
    _call_re = re.compile(r"\b([A-Za-z_]\w*)\s*\(")
    _control_words = {"if", "for", "while", "switch", "return", "sizeof"}

    def parse(self, source_bytes):
        source_text = safe_decode(source_bytes)
        children = self._parse_includes(source_text)
        children.extend(self._parse_functions(source_text))
        return RegexTree(RegexNode("translation_unit", source_text, 0, source_text.count("\n"), children=children))

    def _parse_includes(self, source_text):
        nodes = []
        for match in re.finditer(r"(?m)^\s*#\s*include\s*([<\"][^>\"]+[>\"])", source_text):
            line = source_text.count("\n", 0, match.start())
            path_node = RegexNode("string_literal", match.group(1), line, line)
            nodes.append(
                RegexNode(
                    "preproc_include",
                    match.group(0),
                    line,
                    line,
                    fields={"path": path_node},
                    children=[path_node],
                )
            )
        return nodes

    def _parse_functions(self, source_text):
        nodes = []
        for match in self._signature_re.finditer(source_text):
            name = match.group(1)
            if name in self._control_words:
                continue
            body_start = match.end() - 1
            body_end = self._find_matching_brace(source_text, body_start)
            if body_end <= body_start:
                continue
            start_line = source_text.count("\n", 0, match.start())
            end_line = source_text.count("\n", 0, body_end)
            function_text = source_text[match.start() : body_end + 1]
            body_text = source_text[body_start : body_end + 1]
            identifier = RegexNode("identifier", name, start_line, start_line)
            declarator = RegexNode(
                "function_declarator",
                source_text[match.start() : body_start],
                start_line,
                start_line,
                fields={"declarator": identifier},
                children=[identifier],
            )
            body = RegexNode(
                "compound_statement",
                body_text,
                start_line,
                end_line,
                children=self._parse_calls(body_text, start_line),
            )
            nodes.append(
                RegexNode(
                    "function_definition",
                    function_text,
                    start_line,
                    end_line,
                    fields={"declarator": declarator, "body": body},
                    children=[declarator, body],
                )
            )
        return nodes

    def _find_matching_brace(self, source_text, start_index):
        depth = 0
        for index in range(start_index, len(source_text)):
            char = source_text[index]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return index
        return -1

    def _parse_calls(self, body_text, start_line):
        calls = []
        for match in self._call_re.finditer(body_text):
            name = match.group(1)
            if name in self._control_words:
                continue
            line = start_line + body_text.count("\n", 0, match.start())
            function_node = RegexNode("identifier", name, line, line)
            calls.append(
                RegexNode(
                    "call_expression",
                    match.group(0),
                    line,
                    line,
                    fields={"function": function_node},
                    children=[function_node],
                )
            )
        return calls


# 全局解析器
if Parser and CPP_LANGUAGE:
    parser = Parser()
    parser.language = CPP_LANGUAGE
else:
    parser = RegexParser()

# 全局图数据结构
FUNCTION_CALL_GRAPH = {}
FILE_DEPENDENCIES = {}
CALLED_BY_GRAPH = {}

def safe_decode(b: bytes) -> str:
    for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1']:
        try:
            return b.decode(encoding)
        except UnicodeDecodeError:
            continue
    return b.decode('utf-8', errors='replace')

def extract_comment_before(node, source_lines):
    start_line = node.start_point[0]
    comment_lines = []
    for i in range(start_line - 1, max(0, start_line - 10), -1):
        line = safe_decode(source_lines[i]).strip()
        if line.startswith('//') or line.startswith('/*'):
            comment_lines.append(line)
    return " ".join(comment_lines[::-1]) or "无注释"

def extract_code_snippet(tree, node, source_lines):
    start_line = node.start_point[0]
    end_line = node.end_point[0]
    code_lines = []
    for i in range(start_line, end_line + 1):
        if i < len(source_lines):
            code_lines.append(safe_decode(source_lines[i]))
    return "\n".join(code_lines)

def extract_full_function_name(func_def_node):
    declarator = func_def_node.child_by_field_name("declarator")
    if not declarator:
        return "<unknown>"
    if declarator.type == "function_declarator":
        inner_declarator = declarator.child_by_field_name("declarator")
        if not inner_declarator:
            return "<unknown>"
        if inner_declarator.type == "identifier":
            return inner_declarator.text.decode('utf-8')
        elif inner_declarator.type == "field_expression":
            return extract_qualified_name_from_field(inner_declarator)
        else:
            raw = inner_declarator.text.decode('utf-8')
            return raw.split('(')[0].strip()
    elif declarator.type == "identifier":
        return declarator.text.decode('utf-8')
    else:
        raw = declarator.text.decode('utf-8')
        return raw.split('(')[0].strip()

def extract_qualified_name_from_field(field_node):
    def _extract(n):
        if n.type == "identifier":
            return n.text.decode('utf-8')
        elif n.type == "field_expression":
            left = n.child_by_field_name("argument") or (n.children[0] if n.children else None)
            right = n.child_by_field_name("field") or (n.children[-1] if n.children else None)
            left_str = _extract(left) if left else ""
            right_str = _extract(right) if right else ""
            if left_str and right_str:
                return f"{left_str}::{right_str}"
            return left_str or right_str
        elif n.type == "function_declarator":
            inner = n.child_by_field_name("declarator")
            return _extract(inner) if inner else n.text.decode('utf-8')
        else:
            text = n.text.decode('utf-8')
            return text.split('(')[0].strip()
    return _extract(field_node)

def collect_call_expressions(func_ast_node):
    calls = set()
    def traverse_ast(node):
        if node.type == "call_expression":
            func_node = node.child_by_field_name("function")
            if func_node:
                callee_name = _simplified_extract_callee_name(func_node)
                if callee_name != "<unknown>":
                    calls.add(callee_name)
        for child in node.children:
            traverse_ast(child)
    body_node = func_ast_node.child_by_field_name("body")
    if body_node:
        traverse_ast(body_node)
    return list(calls)

def _simplified_extract_callee_name(func_node):
    if not func_node:
        return "<unknown>"
    raw_text = func_node.text.decode('utf-8', errors='ignore')
    name_part = raw_text.split('(')[0].strip()
    name_part = name_part.rstrip('*&')
    for sep in ['::', '->', '.']:
        if sep in name_part:
            parts = name_part.rsplit(sep, 1)
            name_part = parts[-1] if parts else name_part
    if name_part and (name_part[0].isalpha() or name_part[0] == '_'):
        return name_part
    return "<unknown>"

def extract_class_name(class_node):
    name_node = class_node.child_by_field_name("name")
    if name_node and name_node.type == "type_identifier":
        return name_node.text.decode('utf-8')
    for child in class_node.children:
        if child.type == "type_identifier":
            return child.text.decode('utf-8')
    return "<unknown>"

def extract_member_function_name(func_node, parent_class_name):
    declarator = func_node.child_by_field_name("declarator")
    if not declarator:
        return "<unknown>"
    if declarator.type == "function_declarator":
        inner_declarator = declarator.child_by_field_name("declarator")
        if not inner_declarator:
            return "<unknown>"
        if inner_declarator.type == "field_expression":
            return extract_qualified_name_from_field(inner_declarator)
        elif inner_declarator.type == "identifier":
            return inner_declarator.text.decode('utf-8')
    raw_text = declarator.text.decode('utf-8', errors='ignore')
    func_name = raw_text.split('(')[0].strip()
    func_name = func_name.split()[-1] if ' ' in func_name else func_name
    return func_name

def extract_base_classes(base_clause_node):
    base_classes = []
    for child in base_clause_node.named_children:
        if child.type == "base_class_clause":
            type_node = child.child_by_field_name("type")
            if type_node:
                base_name = type_node.text.decode('utf-8')
                if '::' in base_name:
                    base_name = base_name.split('::')[-1]
                access = 'public'
                for subchild in child.children:
                    if subchild.type in ["public", "protected", "private", "virtual"]:
                        access_keyword = subchild.text.decode('utf-8')
                        if access_keyword in ['public', 'protected', 'private']:
                            access = access_keyword
                base_classes.append({
                    'name': base_name,
                    'access': access
                })
    return base_classes
增加代码知识库；修复文档处理内容；增加API设置 2026-05-16 20:20:10 +08:00			`import os`
			`import re`
			`from pathlib import Path`
			`try:`
			`import tree_sitter`
			`import tree_sitter_c`
			`import tree_sitter_cpp`
			`from tree_sitter import Language, Parser`
			`except ImportError: # pragma: no cover - depends on deployment environment`
			`tree_sitter = None`
			`tree_sitter_c = None`
			`tree_sitter_cpp = None`
			`Language = None`
			`Parser = None`

			`# 常量定义`
			`CPP_EXTENSIONS = {'.c', '.cpp', '.h', '.hpp', '.tcc'}`
			`IGNORE_DIRS = {'build', 'cmake-build', '.git', 'vendor', 'lib', 'external', 'Debug'}`
			`CPP_LANGUAGE = Language(tree_sitter_cpp.language()) if Language and tree_sitter_cpp else None`

			`class RegexNode:`
			`def __init__(self, node_type, text="", start_line=0, end_line=0, fields=None, children=None):`
			`self.type = node_type`
			`self.text = text.encode("utf-8", errors="ignore") if isinstance(text, str) else text`
			`self.start_point = (start_line, 0)`
			`self.end_point = (end_line, 0)`
			`self._fields = fields or {}`
			`self.children = children or []`
			`self.named_children = self.children`

			`def child_by_field_name(self, name):`
			`return self._fields.get(name)`


			`class RegexTree:`
			`def __init__(self, root_node):`
			`self.root_node = root_node`


			`class RegexParser:`
			`_signature_re = re.compile(`
			`r"(?m)^[ \t](?:[A-Za-z_][\w\s\\&:<>]?[ \t]+)?([A-Za-z_]\w(?:::[A-Za-z_]\w*)?)"`
			`r"\s\([^;{}]\)\s*\{"`
			`)`
			`_call_re = re.compile(r"\b([A-Za-z_]\w)\s\(")`
			`_control_words = {"if", "for", "while", "switch", "return", "sizeof"}`

			`def parse(self, source_bytes):`
			`source_text = safe_decode(source_bytes)`
			`children = self._parse_includes(source_text)`
			`children.extend(self._parse_functions(source_text))`
			`return RegexTree(RegexNode("translation_unit", source_text, 0, source_text.count("\n"), children=children))`

			`def _parse_includes(self, source_text):`
			`nodes = []`
			`for match in re.finditer(r"(?m)^\s#\sinclude\s*([<\"][^>\"]+[>\"])", source_text):`
			`line = source_text.count("\n", 0, match.start())`
			`path_node = RegexNode("string_literal", match.group(1), line, line)`
			`nodes.append(`
			`RegexNode(`
			`"preproc_include",`
			`match.group(0),`
			`line,`
			`line,`
			`fields={"path": path_node},`
			`children=[path_node],`
			`)`
			`)`
			`return nodes`

			`def _parse_functions(self, source_text):`
			`nodes = []`
			`for match in self._signature_re.finditer(source_text):`
			`name = match.group(1)`
			`if name in self._control_words:`
			`continue`
			`body_start = match.end() - 1`
			`body_end = self._find_matching_brace(source_text, body_start)`
			`if body_end <= body_start:`
			`continue`
			`start_line = source_text.count("\n", 0, match.start())`
			`end_line = source_text.count("\n", 0, body_end)`
			`function_text = source_text[match.start() : body_end + 1]`
			`body_text = source_text[body_start : body_end + 1]`
			`identifier = RegexNode("identifier", name, start_line, start_line)`
			`declarator = RegexNode(`
			`"function_declarator",`
			`source_text[match.start() : body_start],`
			`start_line,`
			`start_line,`
			`fields={"declarator": identifier},`
			`children=[identifier],`
			`)`
			`body = RegexNode(`
			`"compound_statement",`
			`body_text,`
			`start_line,`
			`end_line,`
			`children=self._parse_calls(body_text, start_line),`
			`)`
			`nodes.append(`
			`RegexNode(`
			`"function_definition",`
			`function_text,`
			`start_line,`
			`end_line,`
			`fields={"declarator": declarator, "body": body},`
			`children=[declarator, body],`
			`)`
			`)`
			`return nodes`

			`def _find_matching_brace(self, source_text, start_index):`
			`depth = 0`
			`for index in range(start_index, len(source_text)):`
			`char = source_text[index]`
			`if char == "{":`
			`depth += 1`
			`elif char == "}":`
			`depth -= 1`
			`if depth == 0:`
			`return index`
			`return -1`

			`def _parse_calls(self, body_text, start_line):`
			`calls = []`
			`for match in self._call_re.finditer(body_text):`
			`name = match.group(1)`
			`if name in self._control_words:`
			`continue`
			`line = start_line + body_text.count("\n", 0, match.start())`
			`function_node = RegexNode("identifier", name, line, line)`
			`calls.append(`
			`RegexNode(`
			`"call_expression",`
			`match.group(0),`
			`line,`
			`line,`
			`fields={"function": function_node},`
			`children=[function_node],`
			`)`
			`)`
			`return calls`


			`# 全局解析器`
			`if Parser and CPP_LANGUAGE:`
			`parser = Parser()`
			`parser.language = CPP_LANGUAGE`
			`else:`
			`parser = RegexParser()`

			`# 全局图数据结构`
			`FUNCTION_CALL_GRAPH = {}`
			`FILE_DEPENDENCIES = {}`
			`CALLED_BY_GRAPH = {}`

			`def safe_decode(b: bytes) -> str:`
			`for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1']:`
			`try:`
			`return b.decode(encoding)`
			`except UnicodeDecodeError:`
			`continue`
			`return b.decode('utf-8', errors='replace')`

			`def extract_comment_before(node, source_lines):`
			`start_line = node.start_point[0]`
			`comment_lines = []`
			`for i in range(start_line - 1, max(0, start_line - 10), -1):`
			`line = safe_decode(source_lines[i]).strip()`
			`if line.startswith('//') or line.startswith('/*'):`
			`comment_lines.append(line)`
			`return " ".join(comment_lines[::-1]) or "无注释"`

			`def extract_code_snippet(tree, node, source_lines):`
			`start_line = node.start_point[0]`
			`end_line = node.end_point[0]`
			`code_lines = []`
			`for i in range(start_line, end_line + 1):`
			`if i < len(source_lines):`
			`code_lines.append(safe_decode(source_lines[i]))`
			`return "\n".join(code_lines)`

			`def extract_full_function_name(func_def_node):`
			`declarator = func_def_node.child_by_field_name("declarator")`
			`if not declarator:`
			`return "<unknown>"`
			`if declarator.type == "function_declarator":`
			`inner_declarator = declarator.child_by_field_name("declarator")`
			`if not inner_declarator:`
			`return "<unknown>"`
			`if inner_declarator.type == "identifier":`
			`return inner_declarator.text.decode('utf-8')`
			`elif inner_declarator.type == "field_expression":`
			`return extract_qualified_name_from_field(inner_declarator)`
			`else:`
			`raw = inner_declarator.text.decode('utf-8')`
			`return raw.split('(')[0].strip()`
			`elif declarator.type == "identifier":`
			`return declarator.text.decode('utf-8')`
			`else:`
			`raw = declarator.text.decode('utf-8')`
			`return raw.split('(')[0].strip()`

			`def extract_qualified_name_from_field(field_node):`
			`def _extract(n):`
			`if n.type == "identifier":`
			`return n.text.decode('utf-8')`
			`elif n.type == "field_expression":`
			`left = n.child_by_field_name("argument") or (n.children[0] if n.children else None)`
			`right = n.child_by_field_name("field") or (n.children[-1] if n.children else None)`
			`left_str = _extract(left) if left else ""`
			`right_str = _extract(right) if right else ""`
			`if left_str and right_str:`
			`return f"{left_str}::{right_str}"`
			`return left_str or right_str`
			`elif n.type == "function_declarator":`
			`inner = n.child_by_field_name("declarator")`
			`return _extract(inner) if inner else n.text.decode('utf-8')`
			`else:`
			`text = n.text.decode('utf-8')`
			`return text.split('(')[0].strip()`
			`return _extract(field_node)`

			`def collect_call_expressions(func_ast_node):`
			`calls = set()`
			`def traverse_ast(node):`
			`if node.type == "call_expression":`
			`func_node = node.child_by_field_name("function")`
			`if func_node:`
			`callee_name = _simplified_extract_callee_name(func_node)`
			`if callee_name != "<unknown>":`
			`calls.add(callee_name)`
			`for child in node.children:`
			`traverse_ast(child)`
			`body_node = func_ast_node.child_by_field_name("body")`
			`if body_node:`
			`traverse_ast(body_node)`
			`return list(calls)`

			`def _simplified_extract_callee_name(func_node):`
			`if not func_node:`
			`return "<unknown>"`
			`raw_text = func_node.text.decode('utf-8', errors='ignore')`
			`name_part = raw_text.split('(')[0].strip()`
			`name_part = name_part.rstrip('*&')`
			`for sep in ['::', '->', '.']:`
			`if sep in name_part:`
			`parts = name_part.rsplit(sep, 1)`
			`name_part = parts[-1] if parts else name_part`
			`if name_part and (name_part[0].isalpha() or name_part[0] == '_'):`
			`return name_part`
			`return "<unknown>"`

			`def extract_class_name(class_node):`
			`name_node = class_node.child_by_field_name("name")`
			`if name_node and name_node.type == "type_identifier":`
			`return name_node.text.decode('utf-8')`
			`for child in class_node.children:`
			`if child.type == "type_identifier":`
			`return child.text.decode('utf-8')`
			`return "<unknown>"`

			`def extract_member_function_name(func_node, parent_class_name):`
			`declarator = func_node.child_by_field_name("declarator")`
			`if not declarator:`
			`return "<unknown>"`
			`if declarator.type == "function_declarator":`
			`inner_declarator = declarator.child_by_field_name("declarator")`
			`if not inner_declarator:`
			`return "<unknown>"`
			`if inner_declarator.type == "field_expression":`
			`return extract_qualified_name_from_field(inner_declarator)`
			`elif inner_declarator.type == "identifier":`
			`return inner_declarator.text.decode('utf-8')`
			`raw_text = declarator.text.decode('utf-8', errors='ignore')`
			`func_name = raw_text.split('(')[0].strip()`
			`func_name = func_name.split()[-1] if ' ' in func_name else func_name`
			`return func_name`

			`def extract_base_classes(base_clause_node):`
			`base_classes = []`
			`for child in base_clause_node.named_children:`
			`if child.type == "base_class_clause":`
			`type_node = child.child_by_field_name("type")`
			`if type_node:`
			`base_name = type_node.text.decode('utf-8')`
			`if '::' in base_name:`
			`base_name = base_name.split('::')[-1]`
			`access = 'public'`
			`for subchild in child.children:`
			`if subchild.type in ["public", "protected", "private", "virtual"]:`
			`access_keyword = subchild.text.decode('utf-8')`
			`if access_keyword in ['public', 'protected', 'private']:`
			`access = access_keyword`
			`base_classes.append({`
			`'name': base_name,`
			`'access': access`
			`})`
			`return base_classes`