diff --git a/tests/results.py b/tests/results.py
new file mode 100644
index 0000000..b01123b
--- /dev/null
+++ b/tests/results.py
@@ -0,0 +1,144 @@
+'''
+Analyse LLM responses: extract each model's final verdict ('human' vs 'AI')
+and grade how the verdicts are distributed per model and per dataset.
+'''
+
+import json
+import re
+import sys
+
+import pandas
+
+
+class ResultsAnalysis:
+
+    def analyse(self):
+        '''
+        Get the final verdicts by the LLM.
+
+        Returns: (dict) the verdicts
+        '''
+        self.results['verdicts'] = {}
+
+        for TYPE_NAME in self.imports['data']:
+            self.results['verdicts'][TYPE_NAME] = {}
+
+            # Loop through each source: one verdict table per source, one column per model.
+            for SOURCE_NAME in self.imports['data'][TYPE_NAME]:
+                VERDICTS = pandas.DataFrame(columns=list(self.imports['models'].keys()))
+                self.results['verdicts'][TYPE_NAME][SOURCE_NAME] = VERDICTS
+
+                # Get the response of each LLM.
+                for RESPONSES_ALL in self.imports['data'][TYPE_NAME][SOURCE_NAME]:
+                    RESPONSES_ROW = []
+
+                    for RESPONSES in RESPONSES_ALL.values():
+                        # Split the last line of the response into fragments.
+                        LASTLINE_FRAGMENTS = re.split(r'[-/\s]+', RESPONSES[-1].lower().strip())
+
+                        # Flag: set once a recognisable verdict fragment is found.
+                        VALID = False
+
+                        if ('maybe' in LASTLINE_FRAGMENTS) or ('or' in LASTLINE_FRAGMENTS):
+                            # The model hedged; record the verdict as unsure.
+                            RESPONSES_ROW.append('')
+                        else:
+                            while LASTLINE_FRAGMENTS and not VALID:
+                                FRAGMENT = LASTLINE_FRAGMENTS.pop(0)
+
+                                if FRAGMENT in ['human', 'professional', 'expert', 'ai', 'llm', 'model']:
+                                    VALID = True
+
+                                    if FRAGMENT in ['human', 'professional', 'expert']:
+                                        RESPONSES_ROW.append('human')
+                                    elif FRAGMENT in ['ai', 'llm', 'model']:
+                                        RESPONSES_ROW.append('AI')
+
+                            if not VALID:
+                                # No recognisable verdict on the last line.
+                                RESPONSES_ROW.append(False)
+
+                    VERDICTS.loc[len(VERDICTS)] = RESPONSES_ROW
+
+        return self.results['verdicts']
+
+    def grade(self):
+        '''
+        Determine the accuracy of the responses.
+
+        Returns: (pandas.DataFrame) verdict counts per model, dataset and author type
+        '''
+        self.results['accuracy'] = pandas.DataFrame(
+            columns=['model name', 'dataname', 'expected', 'human', 'AI', 'unsure', 'invalid'])
+
+        for MODEL_NAME in self.imports['models']:
+            for AUTHOR_TYPE in self.results['verdicts']:
+                for DATANAME in self.results['verdicts'][AUTHOR_TYPE]:
+                    # Count how often the model answered human, AI, unsure or invalid.
+                    COUNTS = self.results['verdicts'][AUTHOR_TYPE][DATANAME][MODEL_NAME].value_counts()
+                    ROW = [
+                        MODEL_NAME,
+                        DATANAME,
+                        AUTHOR_TYPE,
+                        COUNTS.get('human', 0),
+                        COUNTS.get('AI', 0),
+                        COUNTS.get('', 0),
+                        COUNTS.get(False, 0),
+                    ]
+
+                    self.results['accuracy'].loc[len(self.results['accuracy'])] = ROW
+
+        return self.results['accuracy']
+
+    def export(self, PATH):
+        '''
+        Write the accuracy table to a CSV file.
+
+        Parameters:
+            PATH: the output file path
+        '''
+        self.results['accuracy'].to_csv(PATH)
+
+    def __import(self, PARAMETERS):
+        '''
+        Import the necessary files.
+
+        Parameters:
+            PARAMETERS: the input files
+        '''
+        CONFIG = {'data': 1, 'models': 2}
+
+        for NAME in CONFIG:
+            with open(PARAMETERS[CONFIG[NAME]]) as FILE:
+                self.imports[NAME] = json.load(FILE)
+
+    def __init__(self, PARAMETERS):
+        '''
+        Begin the results analysis.
+
+        Parameters:
+            PARAMETERS: launch parameters
+        '''
+        if len(PARAMETERS) != 4:
+            raise ValueError("You are only allowed to have the following (in order): \n\t1.\tresponses file\n\t2.\tthe models file, and\n\t3.\tthe output file. ")
+
+        self.results = {}
+        self.output = {}
+        self.imports = {}
+
+        # Select the paths.
+        self.__import(PARAMETERS)
+        self.output['path'] = PARAMETERS[3]
+
+        # Analyse the responses.
+        self.analyse()
+        self.grade()
+
+        print(self.results['accuracy'])
+
+        # Export the responses.
+        self.export(self.output['path'])
+
+
+if __name__ == "__main__":
+    ResultsAnalysis(sys.argv)
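A minimal sketch of how this script might be driven end to end. The input shapes are inferred from analyse() and __import() rather than documented by this diff: the responses file is assumed to map author type -> source name -> a list of per-model response dicts, where each value is the list of lines in that model's reply (only the last line is parsed for a verdict), and only the keys of the models file are used. All file names and model names below are hypothetical.

    import json

    # Assumes tests/results.py is importable as `results` from the working directory.
    from results import ResultsAnalysis

    # Hypothetical inputs; the nesting is an assumption inferred from analyse().
    data = {
        'human': {                                                        # author type
            'essays': [                                                   # source name
                {'model-a': ['Some reasoning...', 'Verdict: human'],      # parsed as 'human'
                 'model-b': ['Some reasoning...', 'Maybe human or AI']},  # parsed as unsure
            ],
        },
    }
    models = {'model-a': {}, 'model-b': {}}                               # only the keys matter here

    with open('responses.json', 'w') as FILE:
        json.dump(data, FILE)
    with open('models.json', 'w') as FILE:
        json.dump(models, FILE)

    # Equivalent to: python tests/results.py responses.json models.json accuracy.csv
    ResultsAnalysis(['results.py', 'responses.json', 'models.json', 'accuracy.csv'])

If the real responses file nests differently, the sketch above will not line up with it; the shape is an assumption to check against the actual data, not something this diff defines.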