ED_LRR/process.py

import ujson as json
from tqdm import tqdm
from pprint import pprint
import itertools as ITT
import os
import sys
import csv
import sqlite3
import pandas as pd


def is_scoopable(entry):
    """Return True when the leading word of ``entry.type`` marks it scoopable.

    The first whitespace-separated word of ``entry.type`` qualifies when it
    is ``"Neutron"``, ``"White"`` or exactly one of the single-letter
    classes K, G, B, F, O, A, M.

    Args:
        entry: Any object exposing a string ``type`` attribute.

    Returns:
        bool: True if the leading word matches; False otherwise, including
        when ``type`` is empty or contains only whitespace.
    """
    words = entry.type.split()
    if not words:
        return False
    # Tuple membership rather than `first in "KGBFOAM"`: the substring test
    # would also accept multi-letter fragments such as "KG" or "OAM".
    return words[0] in ("Neutron", "White", "K", "G", "B", "F", "O", "A", "M")


def get_mult(name):
    """Return the multiplier associated with a sub-type name.

    Only the first whitespace-separated word of *name* is considered.

    Args:
        name: Sub-type string (e.g. ``"Neutron Star"``). Non-string values
            such as ``None`` are tolerated.

    Returns:
        ``4`` when the first word is ``"Neutron"``, ``1.5`` when it is
        ``"White"``, and ``1`` for everything else (including empty or
        non-string input).
    """
    try:
        first = name.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: name has no .split (e.g. None or NaN).
        # IndexError: name is empty or whitespace-only.
        return 1
    if first == "Neutron":
        return 4
    if first == "White":
        return 1.5
    return 1


def dict_factory(cursor, row):
    """Map one result row to a ``{column name: value}`` dictionary.

    The column names are taken from the first item of each entry in
    ``cursor.description``; values are looked up in *row* by position.
    """
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }


def blocks(files, size=65536):
    """Yield successive chunks read from *files* until a read comes back empty.

    Args:
        files: Object with a ``read(size)`` method.
        size: Amount requested from each ``read`` call.
    """
    chunk = files.read(size)
    while chunk:
        yield chunk
        chunk = files.read(size)


def getlines(f, fn, show_progbar=False):
    """Yield one decoded JSON value per line of an open text file.

    The file is read in chunks via ``blocks``. Each complete line is
    stripped of surrounding whitespace and a trailing comma before being
    handed to ``json.loads``; lines that fail to decode (for example the
    ``[`` / ``]`` wrappers of a JSON array dump, or blank lines) are
    skipped silently.

    Args:
        f: Seekable text-mode file object; it is rewound to the start.
        fn: Label shown in the progress bar description.
        show_progbar: When False the byte progress bar is disabled.

    Yields:
        The decoded value of every line that parses as JSON.
    """
    # Measure the file so the progress bar has a total, then rewind.
    f.seek(0, 2)
    size = f.tell()
    f.seek(0)
    progbar = tqdm(
        desc="Processing " + fn,
        total=size,
        unit="b",
        unit_scale=True,
        unit_divisor=1024,
        ascii=True,
        leave=True,
        disable=(not show_progbar),
    )

    def raw_lines():
        # Carry the unfinished tail of each chunk over to the next one.
        # Split on "\n" only: str.splitlines() also breaks on characters
        # such as \x0b, \x0c, \x85 or \u2028, which may legally appear
        # inside a JSON string and would cut a record in half.
        pending = ""
        for block in blocks(f):
            progbar.n = f.tell()
            progbar.update(0)
            *complete, pending = (pending + block).split("\n")
            yield from complete
        if pending:
            # Final line without a terminating newline.
            yield pending

    try:
        for raw in raw_lines():
            try:
                record = json.loads(raw.strip().rstrip(","))
            except ValueError:
                # Not a JSON record on its own; ignore it.
                continue
            # Yield outside the try so a consumer's ValueError is not swallowed.
            yield record
    finally:
        progbar.close()


def process_file(fn, show_progbar=False):
    """Yield every JSON record found in the file at path *fn*.

    Opens the file, streams it through ``getlines`` and wraps the result in
    a second progress bar that counts records.

    Args:
        fn: Path of the file to read; also used as the progress bar label.
        show_progbar: When False both progress bars are disabled.

    Yields:
        The decoded records produced by ``getlines``.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which differs between platforms.
    with open(fn, "r", encoding="utf-8") as f:
        for line in tqdm(
            getlines(f, fn, show_progbar),
            desc=fn,
            unit=" lines",
            unit_scale=True,
            ascii=True,
            leave=True,
            disable=(not show_progbar),
        ):
            yield line


# Stage 1: copy every record of bodies.json whose "type" contains "Star"
# into stars.jl, one JSON document per line. Skipped when stars.jl exists.
# Note that, despite the message, the filter is not limited to neutron stars.
if not os.path.isfile("stars.jl"):
    print("Filtering for Neutron Stars")
    with open("stars.jl", "w") as star_dump:
        for record in process_file("bodies.json", True):
            # A missing or null "type" is treated as an empty string.
            body_type = record.get("type") or ""
            if "Star" not in body_type:
                continue
            star_dump.write(json.dumps(record) + "\n")


def load_systems(load=False):
    """Open the ``systems.db`` SQLite cache, building it when required.

    The ``systems`` table is (re)built from ``systemsWithCoordinates.json``
    when the database file does not exist yet, or when *load* is true.

    Args:
        load: Force a rebuild of the ``systems`` table even if
            ``systems.db`` is already present.

    Returns:
        tuple: ``(connection, cursor)``. Rows are returned as dicts because
        the connection's ``row_factory`` is set to ``dict_factory``.
    """
    # Check for the file before connecting, since connecting may create it.
    # Previously the argument was overwritten here and therefore ignored.
    load = load or not os.path.isfile("systems.db")
    cache = sqlite3.connect("systems.db")
    cache.row_factory = dict_factory
    c = cache.cursor()
    if load:
        print("Caching Systems")
        c.execute("DROP TABLE IF EXISTS systems")
        c.execute(
            "CREATE TABLE systems (id64 int primary key, name text, x real, y real, z real)"
        )
        cache.commit()
        insert_sql = "INSERT INTO systems VALUES (?,?,?,?,?)"
        # Same effective batch size as the old `len(recs) % 1024 * 1024 == 0`
        # test, which parsed as `(len(recs) % 1024) * 1024 == 0`.
        batch_size = 1024
        recs = []
        for system in process_file("systemsWithCoordinates.json", True):
            coords = system["coords"]
            recs.append(
                (
                    system["id64"],
                    system["name"],
                    coords["x"],
                    coords["y"],
                    coords["z"],
                )
            )
            if len(recs) >= batch_size:
                c.executemany(insert_sql, recs)
                recs.clear()
        if recs:
            # Flush the final partial batch.
            c.executemany(insert_sql, recs)
        cache.commit()
    return cache, c


# Stage 2: join every record of stars.jl with its system's coordinates from
# the SQLite cache and write the result to stars.csv. Skipped when stars.csv
# already exists.
if not os.path.isfile("stars.csv"):
    cache, cur = load_systems()
    try:
        rows = []
        # Written as UTF-8 explicitly because the next stage reads the file
        # back with encoding="utf-8".
        with open("stars.csv", "w", newline="", encoding="utf-8") as sys_csv:
            csv_writer = csv.writer(sys_csv, dialect="excel")
            for neut in process_file("stars.jl", True):
                cur.execute(
                    "SELECT * FROM systems WHERE id64==?", (neut.get("systemId64"),)
                )
                system = cur.fetchone()
                if not system:
                    # No cached system for this body; drop the record.
                    continue
                row = [
                    neut["systemId64"],
                    neut["subType"],
                    neut["name"],
                    get_mult(neut["subType"]),
                    system["x"],
                    system["y"],
                    system["z"],
                ]
                rows.append(row)
                # Hand rows to the writer in batches.
                if len(rows) > 1024:
                    csv_writer.writerows(rows)
                    rows.clear()
            csv_writer.writerows(rows)
            print()
    finally:
        # Release the database connection even if the export fails.
        cache.close()

# Stage 3: normalise stars.csv in place - fill in missing types and keep one
# row per id. NOTE(review): nothing visible here creates stars.kdt, so unless
# another step does, this stage runs on every execution - confirm.
if not os.path.isfile("stars.kdt"):
    tqdm.pandas(ascii=True, leave=True)
    print("Loading data...")
    data = pd.read_csv(
        "stars.csv",
        encoding="utf-8",
        names=["id", "type", "name", "mult", "x", "y", "z"],
    )
    print("Cleaning data...")
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `data.type`: the chained in-place form acts on a temporary object
    # and does not reliably update `data` on current pandas versions.
    data["type"] = data["type"].fillna("Unknown")
    data.drop_duplicates(subset="id", inplace=True)
    print("Writing CSV...")
    data.to_csv("stars.csv", header=False, index=False)
Initial commit 2019-06-05 23:15:49 +00:00			`import ujson as json`
			`from tqdm import tqdm`
			`from pprint import pprint`
			`import itertools as ITT`
			`import os`
			`import sys`
			`import csv`
			`import sqlite3`
			`import pandas as pd`


			`def is_scoopable(entry):`
			`first = entry.type.split()[0]`
			`return first == "Neutron" or first == "White" or first in "KGBFOAM"`


			`def get_mult(name):`
			`try:`
			`first = name.split()[0]`
			`except:`
			`return 1`
			`if first == "Neutron":`
			`return 4`
			`if first == "White":`
			`return 1.5`
			`return 1`


			`def dict_factory(cursor, row):`
			`d = {}`
			`for idx, col in enumerate(cursor.description):`
			`d[col[0]] = row[idx]`
			`return d`


			`def blocks(files, size=65536):`
			`while True:`
			`b = files.read(size)`
			`if not b:`
			`break`
			`yield b`


			`def getlines(f, fn, show_progbar=False):`
			`f.seek(0, 2)`
			`size = f.tell()`
			`f.seek(0)`
			`progbar = tqdm(`
			`desc="Processing " + fn,`
			`total=size,`
			`unit="b",`
			`unit_scale=True,`
			`unit_divisor=1024,`
			`ascii=True,`
			`leave=True,`
			`disable=(not show_progbar),`
			`)`
			`buffer = []`
			`for block in blocks(f):`
			`progbar.n = f.tell()`
			`progbar.update(0)`
			`if buffer:`
			`buffer += (buffer.pop(0) + block).splitlines(keepends=True)`
			`else:`
			`buffer += block.splitlines(keepends=True)`
			`while buffer and buffer[0].endswith("\n"):`
			`try:`
			`yield json.loads(buffer.pop(0).strip().rstrip(","))`
			`except ValueError:`
			`pass`
			`while buffer:`
			`try:`
			`yield json.loads(buffer.pop(0).strip().rstrip(","))`
			`except ValueError:`
			`pass`


			`def process_file(fn, show_progbar=False):`
			`with open(fn, "r") as f:`
			`for line in tqdm(`
			`getlines(f, fn, show_progbar),`
			`desc=fn,`
			`unit=" lines",`
			`unit_scale=True,`
			`ascii=True,`
			`leave=True,`
			`disable=(not show_progbar),`
			`):`
			`yield line`


			`if not os.path.isfile("stars.jl"):`
			`print("Filtering for Neutron Stars")`
			`with open("stars.jl", "w") as neut:`
			`for body in process_file("bodies.json", True):`
			`T = body.get("type") or ""`
			`if "Star" in T:`
			`neut.write(json.dumps(body) + "\n")`


			`def load_systems(load=False):`
			`load = not os.path.isfile("systems.db")`
			`cache = sqlite3.connect("systems.db")`
			`cache.row_factory = dict_factory`
			`c = cache.cursor()`
			`if load:`
			`print("Caching Systems")`
			`c.execute("DROP TABLE IF EXISTS systems")`
			`c.execute(`
			`"CREATE TABLE systems (id64 int primary key, name text, x real, y real, z real)"`
			`)`
			`cache.commit()`
			`recs = []`
			`for system in process_file("systemsWithCoordinates.json", True):`
			`rec = [`
			`system["id64"],`
			`system["name"],`
			`system["coords"]["x"],`
			`system["coords"]["y"],`
			`system["coords"]["z"],`
			`]`
			`recs.append(rec)`
			`if len(recs) % 1024 * 1024 == 0:`
			`c.executemany("INSERT INTO systems VALUES (?,?,?,?,?)", recs)`
			`recs.clear()`
			`c.executemany("INSERT INTO systems VALUES (?,?,?,?,?)", recs)`
			`cache.commit()`
			`return cache, c`


			`if not os.path.isfile("stars.csv"):`
			`cache, cur = load_systems()`
			`rows = []`
			`with open("stars.csv", "w", newline="") as sys_csv:`
			`csv_writer = csv.writer(sys_csv, dialect="excel")`
			`for neut in process_file("stars.jl", True):`
			`cur.execute(`
			`"SELECT * FROM systems WHERE id64==?", (neut.get("systemId64"),)`
			`)`
			`system = cur.fetchone()`
			`if not system:`
			`continue`
			`row = [`
			`neut["systemId64"],`
			`neut["subType"],`
			`neut["name"],`
			`get_mult(neut["subType"]),`
			`system["x"],`
			`system["y"],`
			`system["z"],`
			`]`
			`rows.append(row)`
			`if len(rows) > 1024:`
			`csv_writer.writerows(rows)`
			`rows.clear()`
			`csv_writer.writerows(rows)`
			`print()`
			`cache.close()`

			`if not os.path.isfile("stars.kdt"):`
			`tqdm.pandas(ascii=True, leave=True)`
			`print("Loading data...")`
			`data = pd.read_csv(`
			`"stars.csv",`
			`encoding="utf-8",`
			`names=["id", "type", "name", "mult", "x", "y", "z"],`
			`)`
			`print("Cleaning data...")`
			`data.type.fillna("Unknown", inplace=True)`
			`data.drop_duplicates("id", inplace=True)`
			`print("Writing CSV...")`
			`data.to_csv("stars.csv", header=False, index=False)`