# LLM-Self-Detection-Research/tests/results.py

'''
Analyse the collected LLM responses: extract each model's final verdict ('human' or
'AI'), tally the verdicts per model and source, and export the tallies as a CSV file.
'''
import json
import os
import re
import sys

import pandas
class ResultsAnalysis:
    '''Analyse LLM responses, grade each model's verdicts, and export the results.'''
    def analyse(self):
        '''
        Get the final verdicts given by the LLMs.
        Returns: (dict) the verdicts, one DataFrame per author type and source
        '''
        self.results['verdicts'] = {}
        for TYPE_NAME in list(self.imports['data'].keys()):
            self.results['verdicts'][TYPE_NAME] = {}
            # Loop through each source.
            for SOURCE_NAME in list(self.imports['data'][TYPE_NAME].keys()):
                self.results['verdicts'][TYPE_NAME][SOURCE_NAME] = pandas.DataFrame(columns=list(self.imports['models'].keys()))
                # Get the response of each LLM.
                for RESPONSES_ALL in self.imports['data'][TYPE_NAME][SOURCE_NAME]:
                    RESPONSES_ROW = []
                    for RESPONSES in RESPONSES_ALL.values():
                        # Split the last line of the response into lower-case fragments.
                        LASTLINE_FRAGMENTS = re.split(r'[-/\s]+', re.sub('\t', ' ', RESPONSES[-1]).lower().strip())
                        # Flag: set once a recognisable verdict keyword is found.
                        VALID = False
                        if ('maybe' in LASTLINE_FRAGMENTS or 'or' in LASTLINE_FRAGMENTS):
                            # Hedged answers are recorded as unsure.
                            RESPONSES_ROW.append('')
                        else:
                            while (len(LASTLINE_FRAGMENTS) and not(VALID)):
                                FRAGMENT = LASTLINE_FRAGMENTS.pop(0)
                                if (FRAGMENT in ['human', 'professional', 'expert', 'ai', 'llm', 'model']):
                                    VALID = True
                                    if (FRAGMENT in ['human', 'professional', 'expert']):
                                        RESPONSES_ROW.append('human')
                                    elif (FRAGMENT in ['ai', 'llm', 'model']):
                                        RESPONSES_ROW.append('AI')
                            if (not(VALID)):
                                # No recognisable verdict keyword was found: mark as invalid.
                                RESPONSES_ROW.append(False)
                    self.results['verdicts'][TYPE_NAME][SOURCE_NAME].loc[len(self.results['verdicts'][TYPE_NAME][SOURCE_NAME])] = RESPONSES_ROW
        return self.results['verdicts']
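    # A minimal sketch of how a verdict is read (the sample line is illustrative, not
    # taken from the data): a last line of "Verdict: Human / Expert" lower-cases and
    # splits into ['verdict:', 'human', 'expert']; the first recognised keyword,
    # 'human', decides the verdict, so the row records 'human'.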
    def grade(self):
        '''
        Determine the accuracy of the responses by tallying each model's verdicts.
        Returns: (pandas.DataFrame) the accuracy table
        '''
        self.results['accuracy'] = pandas.DataFrame(columns=['model name', 'dataname', 'expected', 'human', 'AI', 'unsure', 'invalid'])
        for MODEL_NAME in list(self.imports['models'].keys()):
            for AUTHOR_TYPE in list(self.results['verdicts']):
                for DATANAME in list(self.results['verdicts'][AUTHOR_TYPE]):
                    # Count each verdict category once instead of recomputing value_counts per column.
                    COUNTS = self.results['verdicts'][AUTHOR_TYPE][DATANAME][MODEL_NAME].value_counts()
                    ROW = [MODEL_NAME, DATANAME, AUTHOR_TYPE, COUNTS.get('human', 0), COUNTS.get('AI', 0), COUNTS.get('', 0), COUNTS.get(False, 0)]
                    self.results['accuracy'].loc[len(self.results['accuracy'])] = ROW
        return self.results['accuracy']
    def export(self, PATH):
        '''Export the accuracy table as a CSV file to PATH.'''
        self.results['accuracy'].to_csv(PATH)
    def __import(self, PARAMETERS):
        '''
        Import the necessary files.
        Parameters:
            PARAMETERS: the launch parameters holding the input file paths
        '''
        CONFIG = {'data': 1, 'models': 2}
        for NAME in list(CONFIG.keys()):
            # Close each file handle after reading instead of leaving it open.
            with open(PARAMETERS[CONFIG[NAME]]) as FILE:
                self.imports[NAME] = json.load(FILE)
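    # The shape the two JSON files appear to take, inferred from how analyse() and
    # grade() traverse them (the key names below are illustrative):
    #   models file:    {"model-name": ...} -- only the keys (model names) are used.
    #   responses file: {"author-type": {"source-name": [{"model-name": ["line", ...]}, ...]}}
    # Each entry in the inner list maps every model name to that model's response lines;
    # the values are assumed to follow the same order as the models file, and only the
    # last line of each response is parsed for a verdict.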
    def __init__(self, PARAMETERS):
        '''
        Begin the results analysis.
        Parameters:
            PARAMETERS: launch parameters (script name, responses file, models file, output file)
        '''
        if (len(PARAMETERS) != 4):
            raise ImportError("You are only allowed to have the following (in order): \n\t1.\tthe responses file, \n\t2.\tthe models file, and\n\t3.\tthe output file. ")
        self.results = {}
        self.output = {}
        self.imports = {}
        # Select the paths.
        self.__import(PARAMETERS)
        self.output['path'] = PARAMETERS[3]
        # Analyse the responses.
        self.analyse()
        self.grade()
        print(self.results['accuracy'])
        # Export the results.
        self.export(self.output['path'])
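
# Example invocation (the file names are illustrative; the first two arguments must be
# JSON files and the third is the CSV destination):
#   python results.py responses.json models.json accuracy.csv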
if (__name__ == "__main__"):
    ResultsAnalysis(sys.argv)