TBgen_App/data/probset.py

"""
Description :   the class to manage the problem set data
Author      :   Ruidi Qiu (r.qiu@tum.de)
Time        :   2024/3/6 14:01:22
LastEdited  :   2024/8/13 00:56:57
"""

if __name__ == "__main__":
    import sys
    sys.path.append(".") # add the root folder to the python path
from copy import deepcopy
import loader_saver as ls
from config import Config

def main():
    # test the class
    HDLBITS_DATA_PATH = "data/HDLBits/HDLBits_data.jsonl"
    CIRCUIT_TYPE_PATH = "data/HDLBits/HDLBits_circuit_type.jsonl"
    probset = HDLBitsProbset(HDLBITS_DATA_PATH, circuit_type_path=CIRCUIT_TYPE_PATH, only_tasks=["rule110"])
    print(probset.num)
    print(probset.data[0])

class dictlist:
    """
    - a class to manage the list of dict
    - form:

    {
        id_key: "xxx", #the key to identify the dict
        content1: xxx,
        content2: xxx,
        ...
    }
    """
    def __init__(self, id_key:str, path:str=None, moreinfo_path_list:list=[], only:list=None, exclude:list=[], filter:dict={}):
        """
        if path is None, will not load any data
        """
        self.id_key = id_key
        if path is not None:
            try:
                self.data = ls.load_json_lines(path)
            except:
                self.data = ls.load_json_dict(path)
            if moreinfo_path_list != []:
                try:
                    moreinfo = [ls.load_json_lines(moreinfo_path) for moreinfo_path in moreinfo_path_list]
                except:
                    moreinfo = [ls.load_json_dict(moreinfo_path) for moreinfo_path in moreinfo_path_list]
                for info in moreinfo:
                    self.merge(info)
            self.filter(filter)
            self.del_items(only, del_by_list=False)
            self.del_items(exclude)
        else:
            self.data = []

    @property
    def num(self):
        return len(self.data)

    def data_clean(self, only=None, exclude=[], filter={}):
        self.del_items(only, del_by_list=False)
        self.del_items(exclude)
        self.filter(filter)

    def find_data_by_id(self, id):
        for prob_data in self.data:
            if prob_data[self.id_key] == id:
                return prob_data
        raise ValueError("Cannot find the problem infomation with %s: "%(self.id_key) + id + ".")

    def merge(self, additional_data):
        """merge additional data into the original data"""
        for data in self.data:
            for add_data in additional_data:
                if data[self.id_key] == add_data[self.id_key]:
                    for key, value in add_data.items():
                        if key != self.id_key:
                            data[key] = value

    def filter(self, filter_dict, del_en=True):
        """
        #### Function
        - filtering the data by the key and value.
        - only the data that has the key and value will remain
        - the output will always be the filtered data, but I recommend to directly use `self.data` to get the filtered data if del_en is True
        #### Input
        - filter_dict: dict; the key and value to filter the data
        - del_en: bool; if True, the data that doesn't have the key and value will be deleted from the data. If False, the data will not change but output the filtered data
        """
        if del_en:
            for key, value in filter_dict.items():
                self.data = [prob_data for prob_data in self.data if prob_data.get(key) == value]
        else:
            filtered_data = deepcopy(self.data)
            for key, value in filter_dict.items():
                filtered_data = [prob_data for prob_data in filtered_data if prob_data.get(key) == value]
            return filtered_data

    def del_items(self, id_list, del_by_list=True):
        """
        - id_list: list of ids
        - del_by_list: bool; if True, data having the task_id in the list will be deleted. If False, the data that doesn't have the task_id in the list will be deleted
        """
        # avoid default list = [] and del_by_list = False to del all the data
        if id_list is not None and id_list != []:
            if del_by_list:
                self.data = [prob_data for prob_data in self.data if prob_data[self.id_key] not in id_list]
            else: # del the data that doesn't have the task_id in the list
                self.data = [prob_data for prob_data in self.data if prob_data[self.id_key] in id_list]

class muti_dictlist:
    """
    - mutiple dictlists, can perform the same operation on all the dictlists
    - self.dictlists: a list of dictlist
    """
    def __init__(self, id_key:str, path_list:list=None, moreinfo_path_list:list=[], only:list=None, exclude:list=[], filter:dict={}):
        """
        you can only determing the id_key (mostly, "task_id"); the dictlists can be added later
        """
        self.dictlists = []
        self.id_key = id_key
        if path_list is not None:
            self.load_dictlists(id_key, path_list, moreinfo_path_list, only, exclude, filter)

    def load_dictlists(self, id_key, path_list, moreinfo_path_list=[], only=None, exclude=[], filter={}):
        self.dictlists = [dictlist(id_key, path, moreinfo_path_list, only, exclude, filter) for path in path_list]

    def load_dictlist(self, id_key, path, moreinfo_path_list=[], only=None, exclude=[], filter={}):
        self.dictlists.append(dictlist(id_key, path, moreinfo_path_list, only, exclude, filter))

    def data_clean(self, only=None, exclude=[], filter={}):
        for dictlist in self.dictlists:
            dictlist.data_clean(only, exclude, filter)

    def merge(self, additional_data):
        """merge additional data into the original data"""
        for dictlist in self.dictlists:
            dictlist.merge(additional_data)

    def filter(self, filter_dict, del_en=True):
        """
        #### Function
        - filtering the data by the key and value.
        - only the data that has the key and value will remain
        - the output will always be the filtered data, but I recommend to directly use `self.data` to get the filtered data if del_en is True
        #### Input
        - filter_dict: dict; the key and value to filter the data
        - del_en: bool; if True, the data that doesn't have the key and value will be deleted from the data. If False, the data will not change but output the filtered data
        """
        for dictlist in self.dictlists:
            dictlist.filter(filter_dict, del_en)

    def del_items(self, id_list, del_by_list=True):
        """
        - id_list: list of ids
        - del_by_list: bool; if True, data having the task_id in the list will be deleted. If False, the data that doesn't have the task_id in the list will be deleted
        """
        for dictlist in self.dictlists:
            dictlist.del_items(id_list, del_by_list)

    def do(self, func:str, *args, **kwargs):
        """
        this function will perform the function `func` on all the dictlists
        for example, if you want to delete the data with task_id in the list, you can use `do("del_items", task_id_list)`
        """
        result_list = []
        for dictlist in self.dictlists:
            try:
                result_list.append(getattr(dictlist, func)(*args, **kwargs))
            except AttributeError:
                print("The function '%s' is not in dictlist"%func)
        return result_list

    def access(self, attr:str):
        """
        return the list of the attribute of the dictlist
        """
        return [getattr(dictlist, attr) for dictlist in self.dictlists]

    def all_equal(self, attr:str):
        """
        return True if all the attribute of the dictlist are the same
        """
        attr_list = self.access(attr)
        return all(attr == attr_list[0] for attr in attr_list)

    @property
    def num(self):
        return [dictlist.num for dictlist in self.dictlists]

    @property
    def datasets(self):
        return self.dictlists

    @property
    def groups(self):
        return self.dictlists


class HDLBitsProbset(dictlist):

    """ has many similarities with HDLBitsData in HDLBits_data_manager.py"""
    def __init__(self, path:str=None, more_info_paths:list=[], only_tasks:list=None, exclude_tasks:list=[], filter_content:dict={}):
        super().__init__("task_id", path=path, moreinfo_path_list=more_info_paths, only=only_tasks, exclude=exclude_tasks, filter=filter_content)

    @property
    def task_id_list(self):
        """
        return a list of task_id
        """
        return [i["task_id"] for i in self.data]

    def create_empty_set_via_taskids(self, task_id_list):
        """
        return a dictlist with only the task_id in the task_id_list
        """
        self.data = [{"task_id": i} for i in task_id_list]

    def access_data_via_taskid(self, task_id):
        """
        return a dict in all the information of the task_id
        """
        for i in self.data:
            if i["task_id"] == task_id:
                return i
        raise ValueError("task_id %s not found!!!" % (task_id))

    def load_by_config(self, probset_config:Config):
        """
        - if use this, please leave path as Empty when initializing
        - probset_config = config.autoline.probset
        """
        paras = {
            "path": probset_config.path,
            "more_info_paths": probset_config.more_info_paths,
            "only_tasks": probset_config.only,
            "exclude_tasks": probset_config.exclude,
            "filter_content": probset_config.filter[0]
        }
        if probset_config.gptgenRTL_path is not None:
            paras["more_info_paths"].append(probset_config.gptgenRTL_path)
        if probset_config.mutant_path is not None:
            paras["more_info_paths"].append(probset_config.mutant_path)
        self.__init__(**paras)
        if probset_config.exclude_json is not None:
            if type(probset_config.exclude_json) == str:
                exclude_tasks = HDLBitsProbset()
                exclude_tasks.data = ls.load_json_dict(probset_config.exclude_json)
                exclude_task_id_list = exclude_tasks.task_id_list
                self.del_items(exclude_task_id_list, del_by_list=True)
            elif type(probset_config.exclude_json) == list:
                for exclude_json in probset_config.exclude_json:
                    exclude_tasks = HDLBitsProbset()
                    exclude_tasks.data = ls.load_json_dict(exclude_json)
                    exclude_task_id_list = exclude_tasks.task_id_list
                    self.del_items(exclude_task_id_list, del_by_list=True)

    def random_task(self):
        """
        return a random task
        """
        import random
        return random.choice(self.data)

if __name__ == "__main__":
    main() # run the main function