add: implement results analysis and grading functionality
This commit is contained in:
parent
9fd769e5b9
commit
27e6dd6653
1 changed file with 122 additions and 0 deletions
tests/results.py (new file, +122)
@@ -0,0 +1,122 @@
'''
Results analysis and grading for the LLM responses.
'''

import json
import re
import sys

import pandas


class ResultsAnalysis:
    '''
    Get the final verdicts given by the LLMs.

    Returns: (dict) the verdicts
    '''
    def analyse(self):
        self.results['verdicts'] = {}

        for TYPE_NAME in self.imports['data']:
            self.results['verdicts'][TYPE_NAME] = {}

            # Loop through each source.
            for SOURCE_NAME in self.imports['data'][TYPE_NAME]:
                self.results['verdicts'][TYPE_NAME][SOURCE_NAME] = pandas.DataFrame(columns=list(self.imports['models'].keys()))

                # Get the response of each LLM.
                for RESPONSES_ALL in self.imports['data'][TYPE_NAME][SOURCE_NAME]:
                    RESPONSES_ROW = []

                    for RESPONSES in RESPONSES_ALL.values():
                        # Split the last line of the response into fragments.
                        LASTLINE_FRAGMENTS = re.split(r'[-/\s]+', RESPONSES[-1].replace('\t', ' ').lower().strip())

                        # Flag: set once a recognised verdict keyword is found.
                        VALID = False

                        if 'maybe' in LASTLINE_FRAGMENTS or 'or' in LASTLINE_FRAGMENTS:
                            # Hedged answer: record it as unsure.
                            RESPONSES_ROW.append('')
                        else:
                            while LASTLINE_FRAGMENTS and not VALID:
                                FRAGMENT = LASTLINE_FRAGMENTS.pop(0)

                                if FRAGMENT in ['human', 'professional', 'expert', 'ai', 'llm', 'model']:
                                    VALID = True

                                    if FRAGMENT in ['human', 'professional', 'expert']:
                                        RESPONSES_ROW.append('human')
                                    elif FRAGMENT in ['ai', 'llm', 'model']:
                                        RESPONSES_ROW.append('AI')

                            if not VALID:
                                # No recognisable verdict: record it as invalid.
                                RESPONSES_ROW.append(False)

                    self.results['verdicts'][TYPE_NAME][SOURCE_NAME].loc[len(self.results['verdicts'][TYPE_NAME][SOURCE_NAME])] = RESPONSES_ROW

        return self.results['verdicts']

    '''
    Determine the accuracy of the responses.

    Returns: (DataFrame) the per-model verdict counts
    '''
    def grade(self):
        self.results['accuracy'] = pandas.DataFrame(columns=['model name', 'dataname', 'expected', 'human', 'AI', 'unsure', 'invalid'])

        for MODEL_NAME in self.imports['models']:
            for AUTHOR_TYPE in self.results['verdicts']:
                for DATANAME in self.results['verdicts'][AUTHOR_TYPE]:
                    # Tally the verdicts once, then read off each count.
                    COUNTS = self.results['verdicts'][AUTHOR_TYPE][DATANAME][MODEL_NAME].value_counts()
                    ROW = [MODEL_NAME, DATANAME, AUTHOR_TYPE, COUNTS.get('human', 0), COUNTS.get('AI', 0), COUNTS.get('', 0), COUNTS.get(False, 0)]

                    self.results['accuracy'].loc[len(self.results['accuracy'])] = ROW

        return self.results['accuracy']

    '''
    Export the accuracy table to a CSV file.

    Parameters:
        PATH: the output file path
    '''
    def export(self, PATH):
        self.results['accuracy'].to_csv(PATH)

    '''
    Import the necessary files.

    Parameters:
        PARAMETERS: the launch parameters holding the input file paths
    '''
    def __import(self, PARAMETERS):
        # Map each import name to its position in the launch parameters.
        CONFIG = {'data': 1, 'models': 2}

        for NAME in CONFIG:
            with open(PARAMETERS[CONFIG[NAME]]) as FILE:
                self.imports[NAME] = json.load(FILE)

    '''
    Begin the results analysis.

    Parameters:
        PARAMETERS: launch parameters
    '''
    def __init__(self, PARAMETERS):
        if len(PARAMETERS) != 4:
            raise ValueError("Exactly three arguments are required (in order): \n\t1.\tthe responses file, \n\t2.\tthe models file, and\n\t3.\tthe output file.")

        self.results = {}
        self.output = {}
        self.imports = {}

        # Select the paths.
        self.__import(PARAMETERS)
        self.output['path'] = PARAMETERS[3]

        # Analyse the responses.
        self.analyse()
        self.grade()

        print(self.results['accuracy'])

        # Export the results.
        self.export(self.output['path'])


if __name__ == '__main__':
    ResultsAnalysis(sys.argv)
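
Note: the script above reads two JSON files whose layout is only implied by the code. Below is a minimal sketch of inputs that would satisfy __import() and analyse(); every file name, model name, and source name in it is hypothetical, inferred from how the code indexes the data.

import json

# models.json: only the keys are read; they become the verdict columns,
# so each per-prompt response dict must list models in the same order.
MODELS = {'model-a': {}, 'model-b': {}}

# responses.json: author type -> source name -> one dict per prompt,
# mapping each model to its list of response lines (only the last line
# of each response is parsed for a verdict).
DATA = {
    'human': {
        'source-1': [
            {'model-a': ['Reasoning...', 'Verdict: human'],
             'model-b': ['Reasoning...', 'It was an AI.']},
        ],
    },
}

with open('models.json', 'w') as FILE:
    json.dump(MODELS, FILE)
with open('responses.json', 'w') as FILE:
    json.dump(DATA, FILE)

With those files in place, the script would be launched as python tests/results.py responses.json models.json accuracy.csv, i.e. the script path plus exactly three arguments, matching the length check in __init__.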
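The last-line heuristic in analyse() can also be tried in isolation. Here is a standalone sketch of the same logic, using made-up sample lines rather than real responses:

import re

def parse_verdict(LASTLINE):
    # Normalise and split the line exactly as analyse() does.
    FRAGMENTS = re.split(r'[-/\s]+', LASTLINE.replace('\t', ' ').lower().strip())

    # Hedged answers are recorded as unsure ('').
    if 'maybe' in FRAGMENTS or 'or' in FRAGMENTS:
        return ''

    # Otherwise the first recognised keyword decides the verdict.
    for FRAGMENT in FRAGMENTS:
        if FRAGMENT in ['human', 'professional', 'expert']:
            return 'human'
        if FRAGMENT in ['ai', 'llm', 'model']:
            return 'AI'

    # No recognisable keyword: recorded as invalid (False).
    return False

print(parse_verdict('Verdict: human'))     # human
print(parse_verdict('It was an AI/LLM'))   # AI
print(parse_verdict('Maybe a human'))      # (unsure)
print(parse_verdict('No idea'))            # False (invalid)

One consequence of splitting only on hyphens, slashes, and whitespace is that trailing punctuation stays attached to a fragment, so a closing line like "Human." parses as invalid rather than as a human verdict.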