add: implement results analysis and grading functionality
parent 9fd769e5b9
commit 27e6dd6653

1 changed file with 122 additions and 0 deletions

tests/results.py (new file, +122)
@@ -0,0 +1,122 @@
'''
Analyse the responses collected from each LLM and grade their accuracy.
'''

import json
import re
import sys

import pandas


class ResultsAnalysis:
	results = {}
	output = {}
	imports = {}

	def analyse(self):
		'''
		Get the final verdict given by each LLM.

		Returns: (dict) the verdicts, as one DataFrame per author type and source
		'''
		self.results['verdicts'] = {}

		for TYPE_NAME in self.imports['data']:
			self.results['verdicts'][TYPE_NAME] = {}

			# Loop through each source.
			for SOURCE_NAME in self.imports['data'][TYPE_NAME]:
				self.results['verdicts'][TYPE_NAME][SOURCE_NAME] = pandas.DataFrame(columns=list(self.imports['models'].keys()))

				# Get the response of each LLM. Each entry is assumed to be
				# keyed by model name in the same order as the columns above.
				for RESPONSES_ALL in self.imports['data'][TYPE_NAME][SOURCE_NAME]:
					RESPONSES_ROW = []

					for RESPONSES in RESPONSES_ALL.values():
						# Split the last line of the response into fragments
						# on hyphens, slashes, and whitespace.
						LASTLINE_FRAGMENTS = re.split(r'[-/\s]+', RESPONSES[-1].lower().strip())

						# Flags whether a recognised verdict keyword was found.
						VALID = False

						if ('maybe' in LASTLINE_FRAGMENTS or 'or' in LASTLINE_FRAGMENTS):
							# The model hedged; record the verdict as unsure.
							RESPONSES_ROW.append('')
						else:
							while (len(LASTLINE_FRAGMENTS) and not(VALID)):
								FRAGMENT = LASTLINE_FRAGMENTS.pop(0)

								if (FRAGMENT in ['human', 'professional', 'expert', 'ai', 'llm', 'model']):
									VALID = True

									if (FRAGMENT in ['human', 'professional', 'expert']):
										RESPONSES_ROW.append('human')
									elif (FRAGMENT in ['ai', 'llm', 'model']):
										RESPONSES_ROW.append('AI')

							if (not(VALID)):
								# No verdict keyword at all; mark as invalid.
								RESPONSES_ROW.append(False)

					self.results['verdicts'][TYPE_NAME][SOURCE_NAME].loc[len(self.results['verdicts'][TYPE_NAME][SOURCE_NAME])] = RESPONSES_ROW

		return (self.results['verdicts'])

	def grade(self):
		'''
		Determine the accuracy of each model's responses.

		Returns: (pandas.DataFrame) the verdict counts per model, dataset, and expected author type
		'''
		self.results['accuracy'] = pandas.DataFrame(columns=['model name', 'dataname', 'expected', 'human', 'AI', 'unsure', 'invalid'])

		for MODEL_NAME in self.imports['models']:
			for AUTHOR_TYPE in self.results['verdicts']:
				for DATANAME in self.results['verdicts'][AUTHOR_TYPE]:
					# Tally the verdicts once, then read each category from the tally.
					COUNTS = self.results['verdicts'][AUTHOR_TYPE][DATANAME][MODEL_NAME].value_counts()
					ROW = [MODEL_NAME, DATANAME, AUTHOR_TYPE, COUNTS.get('human', 0), COUNTS.get('AI', 0), COUNTS.get('', 0), COUNTS.get(False, 0)]

					self.results['accuracy'].loc[len(self.results['accuracy'])] = ROW

		return (self.results['accuracy'])

	def export(self, PATH):
		# Write the accuracy table to a CSV file at PATH.
		self.results['accuracy'].to_csv(PATH)

	def __import(self, PARAMETERS):
		'''
		Import the necessary files.

		Parameters:
			PARAMETERS: the input files
		'''
		# Position of each input file within the launch parameters.
		CONFIG = {'data': 1, 'models': 2}

		for NAME in CONFIG:
			with open(PARAMETERS[CONFIG[NAME]]) as FILE:
				self.imports[NAME] = json.load(FILE)

	def __init__(self, PARAMETERS):
		'''
		Begin the results analysis.

		Parameters:
			PARAMETERS: launch parameters
		'''
		if (not(len(PARAMETERS) == 4)):
			raise ValueError("Expected exactly the following arguments (in order):\n\t1.\tthe responses file,\n\t2.\tthe models file, and\n\t3.\tthe output file.")

		# Select the paths.
		self.__import(PARAMETERS)
		self.output['path'] = PARAMETERS[3]

		# Analyse the responses.
		self.analyse()
		self.grade()

		print(self.results['accuracy'])

		# Export the results.
		self.export(self.output['path'])

if (__name__ == "__main__"):
	ResultsAnalysis(sys.argv)
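
Note: the script is run as python tests/results.py <responses file> <models file> <output CSV>. Below is a minimal, hypothetical sketch of input files matching the shapes the code expects; the file names, model keys, and response text are illustrative assumptions, not part of the commit.

# Hypothetical example input; all names below are illustrative only.
import json

# The models file: only its keys are used, as the verdict-table columns.
MODELS = {'model-a': {}, 'model-b': {}}

# The responses file: data[author type][source] is a list of entries, one per
# prompt; each entry maps a model name to that model's response lines, in the
# same order as the model keys above. Only the last line of each response is
# scanned for a verdict keyword.
DATA = {
	'human': {
		'essays': [
			{
				'model-a': ['Some reasoning here.', 'Verdict: human'],
				'model-b': ['Some reasoning here.', 'Verdict: AI'],
			},
		],
	},
}

with open('models.json', 'w') as FILE:
	json.dump(MODELS, FILE)

with open('responses.json', 'w') as FILE:
	json.dump(DATA, FILE)

# Then: python tests/results.py responses.json models.json accuracy.csv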