ED_LRR/process.py

173 lines
4.7 KiB
Python
Raw Normal View History

2019-06-05 23:15:49 +00:00
import ujson as json
from tqdm import tqdm
from pprint import pprint
import itertools as ITT
import os
import sys
import csv
import sqlite3
import pandas as pd
def is_scoopable(entry):
    """Return True if the first word of ``entry.type`` is an accepted star class.

    Accepted first words are "Neutron", "White", or exactly one of the
    single-letter classes K, G, B, F, O, A, M.

    Args:
        entry: Object exposing a ``type`` attribute with the star type string.

    Returns:
        bool: True for an accepted class, False otherwise (including when
        ``type`` is empty or None).
    """
    words = (entry.type or "").split()
    if not words:
        return False
    # Tuple membership rather than `first in "KGBFOAM"`: the substring test
    # also accepted multi-letter tokens such as "OA" or "KGB".
    return words[0] in ("Neutron", "White", "K", "G", "B", "F", "O", "A", "M")
def get_mult(name):
    """Return the multiplier associated with a star type name.

    The first whitespace-separated word of ``name`` selects the value:
    "Neutron" gives 4, "White" gives 1.5, anything else gives 1.

    Args:
        name: Star type string. May be None, empty, or a non-string value.

    Returns:
        int | float: 4, 1.5 or 1. Inputs without a usable first word
        (None, "", whitespace only, objects lacking ``split``) yield 1.
    """
    try:
        first = name.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: name is None / not a string.
        # IndexError: name is empty or whitespace only.
        return 1
    return {"Neutron": 4, "White": 1.5}.get(first, 1)
def dict_factory(cursor, row):
    """Build a ``{column_name: value}`` dict for one result row.

    Intended for use as a ``sqlite3`` ``row_factory``: column names are
    taken from the first item of each entry in ``cursor.description`` and
    paired with the value at the same position in ``row``.
    """
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }
def blocks(files, size=65536):
    """Yield successive chunks of up to ``size`` units read from ``files``.

    Reading stops at the first empty chunk returned by ``files.read``;
    that empty chunk is not yielded.
    """
    chunk = files.read(size)
    while chunk:
        yield chunk
        chunk = files.read(size)
def getlines(f, fn, show_progbar=False):
    """Yield one parsed JSON value per line of the open text file ``f``.

    The file is read in chunks via ``blocks``. Each line is stripped of
    surrounding whitespace and of trailing commas before being passed to
    ``json.loads``; lines that fail to parse (``ValueError``) are skipped
    silently.

    Args:
        f: Seekable file object opened in text mode.
        fn: Name shown in the progress bar description.
        show_progbar: When False the progress bar is disabled.

    Yields:
        The decoded JSON value of every line that parses successfully.
    """
    # Measure the file by seeking to its end, then rewind for reading.
    f.seek(0, 2)
    size = f.tell()
    f.seek(0)
    progbar = tqdm(
        desc="Processing " + fn,
        total=size,
        unit="b",
        unit_scale=True,
        unit_divisor=1024,
        ascii=True,
        leave=True,
        disable=(not show_progbar),
    )

    def raw_lines():
        # Carry the unterminated tail of each chunk over to the next one.
        # Splitting on "\n" only (instead of str.splitlines) keeps exactly one
        # pending fragment: splitlines also breaks on characters such as
        # "\r", "\x0b" or "\u2028", which previously left several fragments
        # queued and caused them to be re-joined in the wrong order.
        pending = ""
        for block in blocks(f):
            progbar.n = f.tell()
            progbar.update(0)  # redraw without advancing the counter
            *complete, pending = (pending + block).split("\n")
            yield from complete
        if pending:
            # Last line of a file that does not end with a newline.
            yield pending

    try:
        for raw in raw_lines():
            try:
                record = json.loads(raw.strip().rstrip(","))
            except ValueError:
                # Deliberate best effort: skip anything that is not a
                # standalone JSON value on its own line.
                continue
            yield record
    finally:
        progbar.close()
def process_file(fn, show_progbar=False):
    """Open ``fn`` and yield every JSON value that ``getlines`` parses from it.

    Args:
        fn: Path of the line-oriented JSON file to read.
        show_progbar: When False both progress bars (bytes read, from
            ``getlines``, and lines yielded, created here) are disabled.

    Yields:
        The parsed JSON values, in file order.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(fn, "r", encoding="utf-8") as f:
        yield from tqdm(
            getlines(f, fn, show_progbar),
            desc=fn,
            unit=" lines",
            unit_scale=True,
            ascii=True,
            leave=True,
            disable=(not show_progbar),
        )
# Stage 1: copy every body whose "type" contains "Star" from bodies.json
# into stars.jl, one JSON document per line. Skipped when stars.jl exists.
if not os.path.isfile("stars.jl"):
    print("Filtering for Neutron Stars")
    # Write to a scratch file and rename it only once complete, so an
    # interrupted run cannot leave a truncated stars.jl that later runs
    # would mistake for a finished one.
    stars_tmp = "stars.jl.tmp"
    with open(stars_tmp, "w") as neut:
        for body in process_file("bodies.json", True):
            # "type" may be absent or null; treat both as an empty string.
            if "Star" in (body.get("type") or ""):
                neut.write(json.dumps(body) + "\n")
    os.replace(stars_tmp, "stars.jl")
def load_systems(load=False, batch_size=1024):
    """Open the ``systems.db`` SQLite cache, building it first if required.

    The cache holds one row per system (id64, name, x, y, z) read from
    ``systemsWithCoordinates.json``.

    Args:
        load: Force a rebuild of the ``systems`` table even when
            ``systems.db`` already exists. Previously this argument was
            overwritten and had no effect.
        batch_size: Number of records buffered before each ``executemany``.
            The default matches what the former condition
            ``len(recs) % 1024 * 1024 == 0`` actually did: ``%`` and ``*``
            share precedence, so it flushed every 1024 records.

    Returns:
        tuple: ``(connection, cursor)``. Rows fetched through either are
        dicts produced by ``dict_factory``. The caller closes the connection.
    """
    rebuild = load or not os.path.isfile("systems.db")
    cache = sqlite3.connect("systems.db")
    cache.row_factory = dict_factory
    c = cache.cursor()
    if rebuild:
        print("Caching Systems")
        c.execute("DROP TABLE IF EXISTS systems")
        c.execute(
            "CREATE TABLE systems (id64 int primary key, name text, x real, y real, z real)"
        )
        cache.commit()
        insert_sql = "INSERT INTO systems VALUES (?,?,?,?,?)"
        recs = []
        for system in process_file("systemsWithCoordinates.json", True):
            coords = system["coords"]
            recs.append(
                (
                    system["id64"],
                    system["name"],
                    coords["x"],
                    coords["y"],
                    coords["z"],
                )
            )
            if len(recs) >= batch_size:
                c.executemany(insert_sql, recs)
                recs.clear()
        if recs:
            # Flush the final, partially filled batch.
            c.executemany(insert_sql, recs)
        cache.commit()
    return cache, c
# Stage 2: join each star in stars.jl with its system's coordinates from the
# systems cache and write the result to stars.csv. Skipped when stars.csv
# already exists.
if not os.path.isfile("stars.csv"):
    cache, cur = load_systems()
    try:
        rows = []
        # Write to a scratch file and rename at the end so an interrupted run
        # does not leave a partial stars.csv behind. The explicit UTF-8
        # encoding matches how the file is read back with pandas.
        csv_tmp = "stars.csv.tmp"
        with open(csv_tmp, "w", newline="", encoding="utf-8") as sys_csv:
            csv_writer = csv.writer(sys_csv, dialect="excel")
            for neut in process_file("stars.jl", True):
                cur.execute(
                    "SELECT * FROM systems WHERE id64==?", (neut.get("systemId64"),)
                )
                system = cur.fetchone()
                if not system:
                    # No coordinates known for this star's system; drop it.
                    continue
                row = [
                    neut["systemId64"],
                    neut["subType"],
                    neut["name"],
                    get_mult(neut["subType"]),
                    system["x"],
                    system["y"],
                    system["z"],
                ]
                rows.append(row)
                # Buffer rows and write them in batches.
                if len(rows) > 1024:
                    csv_writer.writerows(rows)
                    rows.clear()
            csv_writer.writerows(rows)
        os.replace(csv_tmp, "stars.csv")
        print()
    finally:
        # Release the SQLite connection even if the export fails midway.
        cache.close()
# Stage 3: normalise stars.csv (fill missing types, drop duplicate ids) and
# rewrite it. Runs whenever stars.kdt is absent.
# NOTE(review): stars.kdt is not created anywhere in this file; presumably a
# later step produces it - confirm.
if not os.path.isfile("stars.kdt"):
    tqdm.pandas(ascii=True, leave=True)
    print("Loading data...")
    data = pd.read_csv(
        "stars.csv",
        encoding="utf-8",
        names=["id", "type", "name", "mult", "x", "y", "z"],
    )
    print("Cleaning data...")
    # Assign the filled column back instead of calling
    # `data.type.fillna(..., inplace=True)`: that chained in-place call works
    # on an intermediate Series and does not update `data` when pandas
    # Copy-on-Write is active.
    data["type"] = data["type"].fillna("Unknown")
    data.drop_duplicates("id", inplace=True)
    print("Writing CSV...")
    # Write to a scratch file first so an interrupted rewrite cannot
    # truncate the existing stars.csv.
    cleaned_tmp = "stars.csv.tmp"
    data.to_csv(cleaned_tmp, header=False, index=False)
    os.replace(cleaned_tmp, "stars.csv")