# LLM-Self-Detection-Research/tests/results.py

'''
Analyse the collected LLM responses: extract each model's final verdict ('human' or
'AI'), tally the verdicts per model and source, and export the tallies as a CSV file.
'''
import json
import os
import re
import sys

import pandas
class ResultsAnalysis:
    '''Analyse LLM responses, grade each model's verdicts, and export the results.'''
    def analyse(self):
        '''
        Get the final verdicts given by the LLMs.
        Returns: (dict) the verdicts, one DataFrame per author type and source
        '''
        self.results['verdicts'] = {}
        for TYPE_NAME in list(self.imports['data'].keys()):
            self.results['verdicts'][TYPE_NAME] = {}
            # Loop through each source.
            for SOURCE_NAME in list(self.imports['data'][TYPE_NAME].keys()):
                self.results['verdicts'][TYPE_NAME][SOURCE_NAME] = pandas.DataFrame(columns=list(self.imports['models'].keys()))
                # Get the response of each LLM.
                for RESPONSES_ALL in self.imports['data'][TYPE_NAME][SOURCE_NAME]:
                    RESPONSES_ROW = []
                    for RESPONSES in RESPONSES_ALL.values():
                        # Split the last line of the response into lower-case fragments.
                        LASTLINE_FRAGMENTS = re.split(r'[-/\s]+', re.sub('\t', ' ', RESPONSES[-1]).lower().strip())
                        # Flag: set once a recognisable verdict keyword is found.
                        VALID = False
                        if ('maybe' in LASTLINE_FRAGMENTS or 'or' in LASTLINE_FRAGMENTS):
                            # Hedged answers are recorded as unsure.
                            RESPONSES_ROW.append('')
                        else:
                            while (len(LASTLINE_FRAGMENTS) and not(VALID)):
                                FRAGMENT = LASTLINE_FRAGMENTS.pop(0)
                                if (FRAGMENT in ['human', 'professional', 'expert', 'ai', 'llm', 'model']):
                                    VALID = True
                                    if (FRAGMENT in ['human', 'professional', 'expert']):
                                        RESPONSES_ROW.append('human')
                                    elif (FRAGMENT in ['ai', 'llm', 'model']):
                                        RESPONSES_ROW.append('AI')
                            if (not(VALID)):
                                # No recognisable verdict keyword was found: mark as invalid.
                                RESPONSES_ROW.append(False)
                    self.results['verdicts'][TYPE_NAME][SOURCE_NAME].loc[len(self.results['verdicts'][TYPE_NAME][SOURCE_NAME])] = RESPONSES_ROW
        return self.results['verdicts']
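    # A minimal sketch of how a verdict is read (the sample line is illustrative, not
    # taken from the data): a last line of "Verdict: Human / Expert" lower-cases and
    # splits into ['verdict:', 'human', 'expert']; the first recognised keyword,
    # 'human', decides the verdict, so the row records 'human'.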
    def grade(self):
        '''
        Determine the accuracy of the responses by tallying each model's verdicts.
        Returns: (pandas.DataFrame) the accuracy table
        '''
        self.results['accuracy'] = pandas.DataFrame(columns=['model name', 'dataname', 'expected', 'human', 'AI', 'unsure', 'invalid'])
        for MODEL_NAME in list(self.imports['models'].keys()):
            for AUTHOR_TYPE in list(self.results['verdicts']):
                for DATANAME in list(self.results['verdicts'][AUTHOR_TYPE]):
                    # Count each verdict category once instead of recomputing value_counts per column.
                    COUNTS = self.results['verdicts'][AUTHOR_TYPE][DATANAME][MODEL_NAME].value_counts()
                    ROW = [MODEL_NAME, DATANAME, AUTHOR_TYPE, COUNTS.get('human', 0), COUNTS.get('AI', 0), COUNTS.get('', 0), COUNTS.get(False, 0)]
                    self.results['accuracy'].loc[len(self.results['accuracy'])] = ROW
        return self.results['accuracy']
    def export(self, PATH):
        '''Export the accuracy table as a CSV file to PATH.'''
        self.results['accuracy'].to_csv(PATH)
    def __import(self, PARAMETERS):
        '''
        Import the necessary files.
        Parameters:
            PARAMETERS: the launch parameters holding the input file paths
        '''
        CONFIG = {'data': 1, 'models': 2}
        for NAME in list(CONFIG.keys()):
            # Close each file handle after reading instead of leaving it open.
            with open(PARAMETERS[CONFIG[NAME]]) as FILE:
                self.imports[NAME] = json.load(FILE)
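    # The shape the two JSON files appear to take, inferred from how analyse() and
    # grade() traverse them (the key names below are illustrative):
    #   models file:    {"model-name": ...} -- only the keys (model names) are used.
    #   responses file: {"author-type": {"source-name": [{"model-name": ["line", ...]}, ...]}}
    # Each entry in the inner list maps every model name to that model's response lines;
    # the values are assumed to follow the same order as the models file, and only the
    # last line of each response is parsed for a verdict.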
    def __init__(self, PARAMETERS):
        '''
        Begin the results analysis.
        Parameters:
            PARAMETERS: launch parameters (script name, responses file, models file, output file)
        '''
        if (len(PARAMETERS) != 4):
            raise ImportError("You are only allowed to have the following (in order): \n\t1.\tthe responses file, \n\t2.\tthe models file, and\n\t3.\tthe output file. ")
        self.results = {}
        self.output = {}
        self.imports = {}
        # Select the paths.
        self.__import(PARAMETERS)
        self.output['path'] = PARAMETERS[3]
        # Analyse the responses.
        self.analyse()
        self.grade()
        print(self.results['accuracy'])
        # Export the results.
        self.export(self.output['path'])
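
# Example invocation (the file names are illustrative; the first two arguments must be
# JSON files and the third is the CSV destination):
#   python results.py responses.json models.json accuracy.csv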
if (__name__ == "__main__"):
    ResultsAnalysis(sys.argv)