ED_LRR/dumps/process.py

import ujson as json
from tqdm import tqdm
from pprint import pprint
import itertools as ITT
import os
import sys
import csv
import sqlite3
import pandas as pd
from urllib.parse import urljoin


def is_scoopable(entry):
    first = entry.type.split()[0]
    return first == "Neutron" or first == "White" or first in "KGBFOAM"


def get_mult(name):
    try:
        first = name.split()[0]
    except:
        return 1
    if first == "Neutron":
        return 4
    if first == "White":
        return 1.5
    return 1


def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d


def blocks(files, size=65536):
    while True:
        b = files.read(size)
        if not b:
            break
        yield b


def getlines(f, fn, show_progbar=False):
    f.seek(0, 2)
    size = f.tell()
    f.seek(0)
    progbar = tqdm(
        desc="Processing " + fn,
        total=size,
        unit="b",
        unit_scale=True,
        unit_divisor=1024,
        ascii=True,
        leave=True,
        disable=(not show_progbar),
    )
    buffer = []
    for block in blocks(f):
        progbar.n = f.tell()
        progbar.update(0)
        if buffer:
            buffer += (buffer.pop(0) + block).splitlines(keepends=True)
        else:
            buffer += block.splitlines(keepends=True)
        while buffer and buffer[0].endswith("\n"):
            try:
                yield json.loads(buffer.pop(0).strip().rstrip(","))
            except ValueError:
                pass
    while buffer:
        try:
            yield json.loads(buffer.pop(0).strip().rstrip(","))
        except ValueError:
            pass


def process_file(fn, show_progbar=False):
    with open(fn, "r") as f:
        for line in tqdm(
            getlines(f, fn, show_progbar),
            desc=fn,
            unit=" lines",
            unit_scale=True,
            ascii=True,
            leave=True,
            disable=(not show_progbar),
        ):
            yield line


if not (
    os.path.isfile("bodies.json") and os.path.isfile("systemsWithCoordinates.json")
):
    exit(
        "Please download bodies.json and systemsWithCoordinates.json from https://www.edsm.net/en/nightly-dumps/"
    )

if not os.path.isfile("stars.jl"):
    print("Filtering for Stars")
    with open("stars.jl", "w") as neut:
        for body in process_file("bodies.json", True):
            T = body.get("type") or ""
            if "Star" in T:
                neut.write(json.dumps(body) + "\n")


def load_systems(load=False):
    load = not os.path.isfile("systems.db")
    cache = sqlite3.connect("systems.db")
    cache.row_factory = dict_factory
    c = cache.cursor()
    if load:
        print("Caching Systems")
        c.execute("DROP TABLE IF EXISTS systems")
        c.execute(
            "CREATE TABLE systems (id64 int primary key, name text, x real, y real, z real)"
        )
        cache.commit()
        recs = []
        for system in process_file("systemsWithCoordinates.json", True):
            rec = [
                system["id64"],
                system["name"],
                system["coords"]["x"],
                system["coords"]["y"],
                system["coords"]["z"],
            ]
            recs.append(rec)
            if len(recs) % 1024 * 1024 == 0:
                c.executemany("INSERT INTO systems VALUES (?,?,?,?,?)", recs)
                recs.clear()
        c.executemany("INSERT INTO systems VALUES (?,?,?,?,?)", recs)
        cache.commit()
    return cache, c


if not os.path.isfile("stars.csv"):
    cache, cur = load_systems()
    rows = []
    with open("stars.csv", "w", newline="") as sys_csv:
        csv_writer = csv.writer(sys_csv, dialect="excel")
        for neut in process_file("stars.jl", True):
            cur.execute(
                "SELECT * FROM systems WHERE id64==?", (neut.get("systemId64"),)
            )
            system = cur.fetchone()
            if not system:
                continue
            row = [
                neut["systemId64"],
                neut["subType"],
                neut["name"],
                get_mult(neut["subType"]),
                system["x"],
                system["y"],
                system["z"],
            ]
            rows.append(row)
            if len(rows) > 1024:
                csv_writer.writerows(rows)
                rows.clear()
        csv_writer.writerows(rows)
        print()
    cache.close()

if not os.path.isfile("stars.csv"):
    tqdm.pandas(ascii=True, leave=True)
    print("Loading data...")
    data = pd.read_csv(
        "stars.csv",
        encoding="utf-8",
        names=["id", "type", "name", "mult", "x", "y", "z"],
    )
    print("Cleaning data...")
    data.type.fillna("Unknown", inplace=True)
    data.drop_duplicates("id", inplace=True)
    print("Writing CSV...")
    data.to_csv("stars.csv", header=False, index=False)