add: implement results analysis and grading functionality
This commit is contained in:
parent
9fd769e5b9
commit
27e6dd6653
1 changed file with 122 additions and 0 deletions
tests/results.py (new file, +122)
@@ -0,0 +1,122 @@
'''
Results analysis and grading for the LLM responses.
'''

import json
import re
import sys

import pandas


class ResultsAnalysis:
    '''
    Get the final verdicts given by the LLMs.

    Returns: (dict) the verdicts
    '''
    def analyse(self):
        self.results['verdicts'] = {}

        for TYPE_NAME in self.imports['data']:
            self.results['verdicts'][TYPE_NAME] = {}

            # Loop through each source.
            for SOURCE_NAME in self.imports['data'][TYPE_NAME]:
                self.results['verdicts'][TYPE_NAME][SOURCE_NAME] = pandas.DataFrame(columns=list(self.imports['models'].keys()))

                # Get the response of each LLM.
                for RESPONSES_ALL in self.imports['data'][TYPE_NAME][SOURCE_NAME]:
                    RESPONSES_ROW = []

                    for RESPONSES in RESPONSES_ALL.values():
                        # Split the last line of the response into fragments.
                        LASTLINE_FRAGMENTS = re.split(r'[-/\s]+', RESPONSES[-1].replace('\t', ' ').lower().strip())

                        # Flag: set once a recognised verdict keyword is found.
                        VALID = False

                        if 'maybe' in LASTLINE_FRAGMENTS or 'or' in LASTLINE_FRAGMENTS:
                            # Hedged answer: record it as unsure.
                            RESPONSES_ROW.append('')
                        else:
                            while LASTLINE_FRAGMENTS and not VALID:
                                FRAGMENT = LASTLINE_FRAGMENTS.pop(0)

                                if FRAGMENT in ['human', 'professional', 'expert', 'ai', 'llm', 'model']:
                                    VALID = True

                                    if FRAGMENT in ['human', 'professional', 'expert']:
                                        RESPONSES_ROW.append('human')
                                    elif FRAGMENT in ['ai', 'llm', 'model']:
                                        RESPONSES_ROW.append('AI')

                            if not VALID:
                                # No recognisable verdict: record it as invalid.
                                RESPONSES_ROW.append(False)

                    self.results['verdicts'][TYPE_NAME][SOURCE_NAME].loc[len(self.results['verdicts'][TYPE_NAME][SOURCE_NAME])] = RESPONSES_ROW

        return self.results['verdicts']

    '''
    Determine the accuracy of the responses.

    Returns: (DataFrame) the per-model verdict counts
    '''
    def grade(self):
        self.results['accuracy'] = pandas.DataFrame(columns=['model name', 'dataname', 'expected', 'human', 'AI', 'unsure', 'invalid'])

        for MODEL_NAME in self.imports['models']:
            for AUTHOR_TYPE in self.results['verdicts']:
                for DATANAME in self.results['verdicts'][AUTHOR_TYPE]:
                    # Tally the verdicts once, then read off each count.
                    COUNTS = self.results['verdicts'][AUTHOR_TYPE][DATANAME][MODEL_NAME].value_counts()
                    ROW = [MODEL_NAME, DATANAME, AUTHOR_TYPE, COUNTS.get('human', 0), COUNTS.get('AI', 0), COUNTS.get('', 0), COUNTS.get(False, 0)]

                    self.results['accuracy'].loc[len(self.results['accuracy'])] = ROW

        return self.results['accuracy']

    '''
    Export the accuracy table to a CSV file.

    Parameters:
        PATH: the output file path
    '''
    def export(self, PATH):
        self.results['accuracy'].to_csv(PATH)

    '''
    Import the necessary files.

    Parameters:
        PARAMETERS: the launch parameters holding the input file paths
    '''
    def __import(self, PARAMETERS):
        # Map each import name to its position in the launch parameters.
        CONFIG = {'data': 1, 'models': 2}

        for NAME in CONFIG:
            with open(PARAMETERS[CONFIG[NAME]]) as FILE:
                self.imports[NAME] = json.load(FILE)

    '''
    Begin the results analysis.

    Parameters:
        PARAMETERS: launch parameters
    '''
    def __init__(self, PARAMETERS):
        if len(PARAMETERS) != 4:
            raise ValueError("Exactly three arguments are required (in order): \n\t1.\tthe responses file, \n\t2.\tthe models file, and\n\t3.\tthe output file.")

        self.results = {}
        self.output = {}
        self.imports = {}

        # Select the paths.
        self.__import(PARAMETERS)
        self.output['path'] = PARAMETERS[3]

        # Analyse the responses.
        self.analyse()
        self.grade()

        print(self.results['accuracy'])

        # Export the results.
        self.export(self.output['path'])


if __name__ == '__main__':
    ResultsAnalysis(sys.argv)
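
Note: the script above reads two JSON files whose layout is only implied by the code. Below is a minimal sketch of inputs that would satisfy __import() and analyse(); every file name, model name, and source name in it is hypothetical, inferred from how the code indexes the data.

import json

# models.json: only the keys are read; they become the verdict columns,
# so each per-prompt response dict must list models in the same order.
MODELS = {'model-a': {}, 'model-b': {}}

# responses.json: author type -> source name -> one dict per prompt,
# mapping each model to its list of response lines (only the last line
# of each response is parsed for a verdict).
DATA = {
    'human': {
        'source-1': [
            {'model-a': ['Reasoning...', 'Verdict: human'],
             'model-b': ['Reasoning...', 'It was an AI.']},
        ],
    },
}

with open('models.json', 'w') as FILE:
    json.dump(MODELS, FILE)
with open('responses.json', 'w') as FILE:
    json.dump(DATA, FILE)

With those files in place, the script would be launched as python tests/results.py responses.json models.json accuracy.csv, i.e. the script path plus exactly three arguments, matching the length check in __init__.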
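The last-line heuristic in analyse() can also be tried in isolation. Here is a standalone sketch of the same logic, using made-up sample lines rather than real responses:

import re

def parse_verdict(LASTLINE):
    # Normalise and split the line exactly as analyse() does.
    FRAGMENTS = re.split(r'[-/\s]+', LASTLINE.replace('\t', ' ').lower().strip())

    # Hedged answers are recorded as unsure ('').
    if 'maybe' in FRAGMENTS or 'or' in FRAGMENTS:
        return ''

    # Otherwise the first recognised keyword decides the verdict.
    for FRAGMENT in FRAGMENTS:
        if FRAGMENT in ['human', 'professional', 'expert']:
            return 'human'
        if FRAGMENT in ['ai', 'llm', 'model']:
            return 'AI'

    # No recognisable keyword: recorded as invalid (False).
    return False

print(parse_verdict('Verdict: human'))     # human
print(parse_verdict('It was an AI/LLM'))   # AI
print(parse_verdict('Maybe a human'))      # (unsure)
print(parse_verdict('No idea'))            # False (invalid)

One consequence of splitting only on hyphens, slashes, and whitespace is that trailing punctuation stays attached to a fragment, so a closing line like "Human." parses as invalid rather than as a human verdict.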