diff --git a/data.ipynb b/data.ipynb new file mode 100644 index 0000000..2535239 --- /dev/null +++ b/data.ipynb @@ -0,0 +1,825 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# An Evaluation on the Effectiveness of Large Language Models in Self-Detection\n", + "## Data Compilation\n", + "This notebook contains the scripts for importing and merging the training and testing data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Importing the modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pip in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from -r dependencies.txt (line 1)) (24.3.1)\n", + "Requirement already satisfied: kaggle in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from -r dependencies.txt (line 2)) (1.6.17)\n", + "Requirement already satisfied: kagglehub in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from -r dependencies.txt (line 3)) (0.3.4)\n", + "Requirement already satisfied: ollama in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from -r dependencies.txt (line 4)) (0.4.2)\n", + "Requirement already satisfied: pandas in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from -r dependencies.txt (line 5)) (2.2.3)\n", + "Requirement already satisfied: six>=1.10 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kaggle->-r dependencies.txt (line 2)) (1.16.0)\n", + "Requirement already satisfied: certifi>=2023.7.22 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kaggle->-r dependencies.txt (line 2)) (2024.8.30)\n", + "Requirement already satisfied: python-dateutil in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kaggle->-r 
dependencies.txt (line 2)) (2.9.0.post0)\n", + "Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kaggle->-r dependencies.txt (line 2)) (2.32.3)\n", + "Requirement already satisfied: tqdm in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kaggle->-r dependencies.txt (line 2)) (4.66.5)\n", + "Requirement already satisfied: python-slugify in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from kaggle->-r dependencies.txt (line 2)) (8.0.4)\n", + "Requirement already satisfied: urllib3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kaggle->-r dependencies.txt (line 2)) (2.2.3)\n", + "Requirement already satisfied: bleach in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from kaggle->-r dependencies.txt (line 2)) (6.2.0)\n", + "Requirement already satisfied: packaging in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kagglehub->-r dependencies.txt (line 3)) (24.1)\n", + "Requirement already satisfied: httpx<0.28.0,>=0.27.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ollama->-r dependencies.txt (line 4)) (0.27.2)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ollama->-r dependencies.txt (line 4)) (2.9.2)\n", + "Requirement already satisfied: numpy>=1.26.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas->-r dependencies.txt (line 5)) (2.0.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from pandas->-r dependencies.txt (line 5)) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from pandas->-r dependencies.txt (line 5)) 
(2024.2)\n", + "Requirement already satisfied: anyio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama->-r dependencies.txt (line 4)) (4.6.2.post1)\n", + "Requirement already satisfied: httpcore==1.* in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama->-r dependencies.txt (line 4)) (1.0.6)\n", + "Requirement already satisfied: idna in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama->-r dependencies.txt (line 4)) (3.10)\n", + "Requirement already satisfied: sniffio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama->-r dependencies.txt (line 4)) (1.3.1)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama->-r dependencies.txt (line 4)) (0.14.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama->-r dependencies.txt (line 4)) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama->-r dependencies.txt (line 4)) (2.23.4)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama->-r dependencies.txt (line 4)) (4.12.2)\n", + "Requirement already satisfied: webencodings in /Users/hksaw/Library/Python/3.12/lib/python/site-packages (from bleach->kaggle->-r dependencies.txt (line 2)) (0.5.1)\n", + "Requirement already satisfied: text-unidecode>=1.3 in 
/Users/hksaw/Library/Python/3.12/lib/python/site-packages (from python-slugify->kaggle->-r dependencies.txt (line 2)) (1.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->kaggle->-r dependencies.txt (line 2)) (3.4.0)\n" + ] + } + ], + "source": [ + "# Install into the running kernel's environment; a bare pip3 in a bash cell may belong to a different interpreter.\n", + "%pip install --upgrade -r dependencies.txt --user" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import json, os, pandas\n", + "from IPython.display import display, HTML, Markdown\n", + "from collections import Counter\n", + "import kagglehub, huggingface_hub\n", + "import random" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### File input management" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Downloading the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def download_datasets(): \n", + " STATUS = display(\"\", display_id=True);\n", + "\n", + " # Read the dataset listings. \n", + " SOURCES = json.load(open(\"data/sources.json\"));\n", + "\n", + " # Loop through each dataset target.\n", + " for NAME_ID in range(len(SOURCES.keys())): \n", + " # Get the data about the dataset. \n", + " NAME = list(SOURCES.keys())[NAME_ID];\n", + " DOWNLOAD_STATE = False;\n", + " DATASET_DATA = SOURCES[NAME];\n", + "\n", + " if (os.path.isdir((\"data/datasets/\" + NAME))): \n", + " if (os.path.isfile((f\"data/datasets/{NAME}/{DATASET_DATA['filename']}\"))):\n", + " DOWNLOAD_STATE = True;\n", + " \n", + " if not(DOWNLOAD_STATE): \n", + " STATUS.update(HTML(f\"