radical-bot/lib/ocr.py

93 lines
3.1 KiB
Python

import re
from datetime import datetime
from io import BytesIO
import pytz
from pytesseract import image_to_string
from PIL import Image
from lib.config import load_config
from lib.room import Room
# Public API: only get_room_data is exported by `from ... import *`.
__all__ = ["get_room_data"]
# Loaded once at import time; libcal_to_datetime reads config.time_zone from it.
config = load_config()
# Pattern for one LibCal booking line, e.g.
# "L-1234: 10:00am - 11:30am, Monday, January 5, 2024".
# Based on https://regex101.com/r/OkWfkC/1 (hour/minute groups tightened since).
RE_STRING = re.compile(
    r"(L-[0-9]{4}): "  # room number (group 1)
    # time-slot: start hour, minute, am/pm (groups 2, 3, 4) and
    # end hour, minute, am/pm (groups 5, 6, 7).
    # Hours are 1-12 without a leading zero: the previous class [1-9]{1,2}
    # could not match "10" and accepted impossible hours such as "13" or "99".
    # Minutes are any two-digit value 00-59: the previous class [0,3]{2} also
    # accepted literal commas.
    r"(1[0-2]|[1-9]):([0-5][0-9])(am|pm) - (1[0-2]|[1-9]):([0-5][0-9])(am|pm), "
    # weekday (group 8)
    r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday), "
    # month (group 9)
    r"(January|February|March|April|May|June|July|August|September|October|November|December) "
    r"([1-9][0-9]|[1-9]), ([0-9]{4})",  # day & year (group 10 & 11)
    flags=re.M,
)
class NotAMatchException(Exception):
    """Signals that a value expected to be an ``re.Match`` was something else."""
class NoMatchException(Exception):
    """Signals that no room booking could be found in the scanned text."""
def add_zero_padding(value: str):
    """Return ``value`` as a two-character string when it is a single digit.

    Values whose integer form is below 10 are rebuilt as "0" plus that
    integer, which also collapses any leading zeros already present
    ("007" -> "07"). Anything else is handed back untouched.
    """
    number = int(value)
    # Rebuilding from the int (rather than prefixing the input) is what
    # strips pre-existing leading zeros.
    return f"0{number}" if number < 10 else value
def libcal_to_datetime(year: str, month: str, day: str, hour: str, minute: str, am_pm: str) -> datetime:
    """Build a timezone-aware datetime from date parts as LibCal displays them.

    Every argument is expected as the raw, unmodified string (full month
    name, 12-hour clock hour, "am"/"pm"). Day, hour and minute are
    zero-padded before parsing. The result is localized to the zone named
    by ``config.time_zone``.
    """
    parts = (
        year,
        month,
        add_zero_padding(day),
        add_zero_padding(hour),
        add_zero_padding(minute),
        am_pm,
    )
    naive = datetime.strptime("{}-{}-{}-{}-{}-{}".format(*parts), "%Y-%B-%d-%I-%M-%p")
    # localize() attaches the configured timezone to the naive datetime.
    return pytz.timezone(config.time_zone).localize(naive)
def correct_commas(string: str):
    """Insert a space after every comma that is directly followed by a non-space.

    A lookahead is used so the character after the comma is not consumed.
    The previous pattern ``(,)([^ ])`` swallowed that character, so in a run
    such as ",,b" the second comma was skipped and kept no trailing space.

    A comma at the very end of the string is left as-is, because nothing
    follows it.

    Args:
        string: Text to normalise.

    Returns:
        The text with ", " in place of each comma followed by a non-space.
    """
    return re.sub(r",(?=[^ ])", ", ", string)
def correct_newlines(string: str):
    """Return the given string with each newline character turned into a space."""
    pieces = string.split("\n")
    return " ".join(pieces)
def get_room_data(img: bytes) -> list[Room]:
    """Extract every room booking found in an image.

    The image is turned into text by ``get_image_string``, which already
    replaces newlines and normalises comma spacing, so the text is not
    normalised a second time here. Each occurrence of ``RE_STRING`` in that
    text becomes one ``Room``.

    Args:
        img: Raw bytes of the image to read.

    Returns:
        One ``Room`` per matched booking, in the order the bookings appear
        in the text.

    Raises:
        NoMatchException: If the text contains no booking at all.

    Errors raised by ``libcal_to_datetime`` (e.g. for a date that cannot be
    parsed) propagate unchanged.
    """
    img_string = get_image_string(img)

    rooms: list[Room] = []
    # finditer only ever yields re.Match objects, so no per-item type check
    # is needed.
    for match in RE_STRING.finditer(img_string):
        # Group layout follows RE_STRING: room, start time (3 groups),
        # end time (3 groups), weekday, month, day, year.
        (room_number,
         start_hour, start_minute, start_am_pm,
         end_hour, end_minute, end_am_pm,
         _weekday, month, day, year) = match.groups()
        start_time = libcal_to_datetime(
            year, month, day, start_hour, start_minute, start_am_pm)
        end_time = libcal_to_datetime(
            year, month, day, end_hour, end_minute, end_am_pm)
        rooms.append(Room(room_number, start_time, end_time))

    if not rooms:
        raise NoMatchException
    return rooms
def get_image_string(img: bytes) -> str:
    """Run OCR on the given image bytes and return the cleaned-up text.

    The recognised text has its newlines replaced by spaces first, then a
    space is ensured after commas.
    """
    image = Image.open(BytesIO(img))
    raw_text = image_to_string(image)
    return correct_commas(correct_newlines(raw_text))