'''
Analyse the responses collected from a set of LLMs on an authorship-detection
task: extract each model's final verdict ('human' or 'AI'), tally the verdicts
per model and dataset, and export the resulting accuracy table to a CSV file.
'''

import json
import re
import sys

import pandas


class ResultsAnalysis:
    '''Parse, grade, and export the verdicts returned by each LLM.'''

    def analyse(self):
        '''
        Get the final verdicts given by each LLM.

        Returns: (dict) the verdicts
        '''
        self.results['verdicts'] = {}

        for TYPE_NAME in self.imports['data']:
            self.results['verdicts'][TYPE_NAME] = {}

            # Loop through each source.
            for SOURCE_NAME in self.imports['data'][TYPE_NAME]:
                self.results['verdicts'][TYPE_NAME][SOURCE_NAME] = pandas.DataFrame(columns=list(self.imports['models'].keys()))

                # Get the response of each LLM; each row holds one verdict per model.
                for RESPONSES_ALL in self.imports['data'][TYPE_NAME][SOURCE_NAME]:
                    RESPONSES_ROW = []

                    for RESPONSES in RESPONSES_ALL.values():
                        # Normalise the final line of the response and split it into fragments.
                        LASTLINE_FRAGMENTS = re.split(r'[-/\s]+', RESPONSES[-1].replace('\t', ' ').lower().strip())

                        # Set once a recognised verdict keyword is found.
                        VALID = False

                        # Hedged answers ('maybe ...', '... or ...') count as unsure.
                        if ('maybe' in LASTLINE_FRAGMENTS or 'or' in LASTLINE_FRAGMENTS):
                            RESPONSES_ROW.append('')
                        else:
                            # Scan the fragments until a verdict keyword appears.
                            while (len(LASTLINE_FRAGMENTS) and not VALID):
                                FRAGMENT = LASTLINE_FRAGMENTS.pop(0)

                                if (FRAGMENT in ['human', 'professional', 'expert', 'ai', 'llm', 'model']):
                                    VALID = True

                                if (FRAGMENT in ['human', 'professional', 'expert']):
                                    RESPONSES_ROW.append('human')
                                elif (FRAGMENT in ['ai', 'llm', 'model']):
                                    RESPONSES_ROW.append('AI')

                            # No keyword was found anywhere: mark the response as invalid.
                            if (not VALID):
                                RESPONSES_ROW.append(False)

                    self.results['verdicts'][TYPE_NAME][SOURCE_NAME].loc[len(self.results['verdicts'][TYPE_NAME][SOURCE_NAME])] = RESPONSES_ROW

        return self.results['verdicts']
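
    # A minimal sketch of how the parsing above classifies a response's final
    # line (the sample lines are hypothetical, not taken from any dataset):
    #
    #     "Verdict: Human"        -> ['verdict:', 'human'] -> 'human'
    #     "AI-generated"          -> ['ai', 'generated']   -> 'AI'
    #     "Maybe human, maybe AI" -> contains 'maybe'      -> ''    (unsure)
    #     "I cannot tell."        -> no keyword found      -> False (invalid)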

    def grade(self):
        '''
        Determine the accuracy of the responses.

        Returns: (pandas.DataFrame) the verdict counts per model and dataset
        '''
        self.results['accuracy'] = pandas.DataFrame(columns=['model name', 'dataname', 'expected', 'human', 'AI', 'unsure', 'invalid'])

        for MODEL_NAME in self.imports['models']:
            for AUTHOR_TYPE in self.results['verdicts']:
                for DATANAME in self.results['verdicts'][AUTHOR_TYPE]:
                    # Tally the model's verdicts once instead of recomputing them per column.
                    COUNTS = self.results['verdicts'][AUTHOR_TYPE][DATANAME][MODEL_NAME].value_counts()

                    ROW = [MODEL_NAME, DATANAME, AUTHOR_TYPE, COUNTS.get('human', 0), COUNTS.get('AI', 0), COUNTS.get('', 0), COUNTS.get(False, 0)]

                    self.results['accuracy'].loc[len(self.results['accuracy'])] = ROW

        return self.results['accuracy']
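
    # The accuracy frame built above has one row per model/author-type/dataset
    # combination; for example (all names and counts below are hypothetical):
    #
    #     model name  dataname  expected  human  AI  unsure  invalid
    #     model-a     essays    human        37   9       3        1
    #     model-a     essays    AI            5  42       2        1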

    def export(self, PATH):
        '''Write the accuracy table to a CSV file at PATH.'''
        self.results['accuracy'].to_csv(PATH)

    def __import(self, PARAMETERS):
        '''
        Import the necessary files.

        Parameters:
            PARAMETERS: the input files
        '''
        # Positional arguments: 1 = the responses (data) file, 2 = the models file.
        CONFIG = {'data': 1, 'models': 2}

        for NAME in CONFIG:
            # Close each file promptly rather than relying on garbage collection.
            with open(PARAMETERS[CONFIG[NAME]]) as FILE:
                self.imports[NAME] = json.load(FILE)
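
    # The JSON layouts this method expects, inferred from how the imports are
    # used above (every key and value shown here is illustrative):
    #
    #     models file: {"model-a": {...}, "model-b": {...}}
    #     data file:   {"human": {"essays": [{"model-a": ["...", "Verdict: Human"],
    #                                         "model-b": ["...", "AI"]}]}}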

    def __init__(self, PARAMETERS):
        '''
        Begin the results analysis.

        Parameters:
            PARAMETERS: launch parameters
        '''
        if (len(PARAMETERS) != 4):
            raise ValueError("You are only allowed to have the following (in order): \n\t1.\tthe responses file, \n\t2.\tthe models file, and\n\t3.\tthe output file.")

        # Initialise per-instance state so it is never shared between instances.
        self.results = {}
        self.output = {}
        self.imports = {}

        # Select the paths.
        self.__import(PARAMETERS)
        self.output['path'] = PARAMETERS[3]

        # Analyse the responses.
        self.analyse()
        self.grade()

        print(self.results['accuracy'])

        # Export the results.
        self.export(self.output['path'])

if (__name__ == "__main__"):
    ResultsAnalysis(sys.argv)
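
# Example invocation, assuming this file is saved as results_analysis.py (the
# script and data file names are hypothetical):
#
#     python results_analysis.py responses.json models.json accuracy.csv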