"""Preprocess EDSM-style dump files (bodies.json, systemsWithCoordinates.json)
into stars.csv: one row per star with its system's coordinates and the
jump-range multiplier it grants when supercharging (neutron 4x, white dwarf 1.5x)."""

import ujson as json
from tqdm import tqdm
from pprint import pprint
import itertools as ITT
import os
import sys
import csv
import sqlite3
import pandas as pd


def is_scoopable(entry):
    """True if the body is a fuel-scoopable star (KGBFOAM) or a boost star."""
    first = entry["type"].split()[0]
    return first == "Neutron" or first == "White" or first in set("KGBFOAM")


def get_mult(name):
    """Jump-range multiplier granted by supercharging at this star type."""
    try:
        first = name.split()[0]
    except (AttributeError, IndexError):
        return 1
    if first == "Neutron":
        return 4
    if first == "White":
        return 1.5
    return 1


def dict_factory(cursor, row):
    """sqlite3 row factory that maps column names to values."""
    return {col[0]: row[idx] for idx, col in enumerate(cursor.description)}


def blocks(f, size=65536):
    """Yield successive chunks of at most `size` characters from `f`."""
    while True:
        b = f.read(size)
        if not b:
            break
        yield b


def getlines(f, fn, show_progbar=False):
    """Stream one JSON object per line from `f`, tracking progress by bytes read."""
    f.seek(0, 2)  # measure the file size, then rewind
    size = f.tell()
    f.seek(0)
    progbar = tqdm(
        desc="Processing " + fn,
        total=size,
        unit="b",
        unit_scale=True,
        unit_divisor=1024,
        ascii=True,
        leave=True,
        disable=not show_progbar,
    )
    buffer = []
    for block in blocks(f):
        progbar.n = f.tell()
        progbar.update(0)  # redraw the bar without advancing it
        if buffer:
            # Rejoin the leftover partial line with the new block before splitting.
            buffer += (buffer.pop(0) + block).splitlines(keepends=True)
        else:
            buffer += block.splitlines(keepends=True)
        # Lines still ending in "\n" are complete; at most one partial line
        # stays in the buffer for the next block.
        while buffer and buffer[0].endswith("\n"):
            try:
                # The dumps are JSON arrays with one object per line; strip the
                # trailing comma so each line parses as a standalone object.
                yield json.loads(buffer.pop(0).strip().rstrip(","))
            except ValueError:
                pass  # the "[" / "]" lines and blank lines do not parse
    while buffer:  # drain whatever remains after the final block
        try:
            yield json.loads(buffer.pop(0).strip().rstrip(","))
        except ValueError:
            pass


def process_file(fn, show_progbar=False):
    """Iterate over the JSON objects in `fn` with a line-count progress bar."""
    with open(fn, "r") as f:
        for line in tqdm(
            getlines(f, fn, show_progbar),
            desc=fn,
            unit=" lines",
            unit_scale=True,
            ascii=True,
            leave=True,
            disable=not show_progbar,
        ):
            yield line


# Step 1: reduce the full bodies dump to a JSON-lines file of stars only.
if not os.path.isfile("stars.jl"):
    print("Filtering bodies.json for stars")
    with open("stars.jl", "w") as stars_file:
        for body in process_file("bodies.json", True):
            body_type = body.get("type") or ""
            if "Star" in body_type:
                stars_file.write(json.dumps(body) + "\n")


def load_systems():
    """Open the sqlite cache of system coordinates, building it on first run."""
    load = not os.path.isfile("systems.db")
    cache = sqlite3.connect("systems.db")
    cache.row_factory = dict_factory
    c = cache.cursor()
    if load:
        print("Caching Systems")
        c.execute("DROP TABLE IF EXISTS systems")
        c.execute(
            "CREATE TABLE systems (id64 int primary key, name text, x real, y real, z real)"
        )
        cache.commit()
        recs = []
        for system in process_file("systemsWithCoordinates.json", True):
            recs.append(
                [
                    system["id64"],
                    system["name"],
                    system["coords"]["x"],
                    system["coords"]["y"],
                    system["coords"]["z"],
                ]
            )
            if len(recs) >= 1024 * 1024:  # flush in ~1M-row batches to bound memory
                c.executemany("INSERT INTO systems VALUES (?,?,?,?,?)", recs)
                recs.clear()
        c.executemany("INSERT INTO systems VALUES (?,?,?,?,?)", recs)
        cache.commit()
    return cache, c


# Step 2: join every star with its system's coordinates and write stars.csv.
if not os.path.isfile("stars.csv"):
    cache, cur = load_systems()
    rows = []
    with open("stars.csv", "w", newline="") as sys_csv:
        csv_writer = csv.writer(sys_csv, dialect="excel")
        for star in process_file("stars.jl", True):
            cur.execute(
                "SELECT * FROM systems WHERE id64 == ?", (star.get("systemId64"),)
            )
            system = cur.fetchone()
            if not system:
                continue  # no coordinates known for this star's system
            rows.append(
                [
                    star["systemId64"],
                    star["subType"],
                    star["name"],
                    get_mult(star["subType"]),
                    system["x"],
                    system["y"],
                    system["z"],
                ]
            )
            if len(rows) > 1024:
                csv_writer.writerows(rows)
                rows.clear()
        csv_writer.writerows(rows)
    print()
    cache.close()

# Step 3: clean the CSV with pandas before the kd-tree is built from it.
if not os.path.isfile("stars.kdt"):
    tqdm.pandas(ascii=True, leave=True)
    print("Loading data...")
    data = pd.read_csv(
        "stars.csv",
        encoding="utf-8",
        names=["id", "type", "name", "mult", "x", "y", "z"],
    )
    print("Cleaning data...")
    data["type"] = data["type"].fillna("Unknown")
    data.drop_duplicates("id", inplace=True)
    print("Writing CSV...")
    data.to_csv("stars.csv", header=False, index=False)
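
# --- Hypothetical kd-tree build step (not present in the original source) ---
# Step 3 guards on "stars.kdt" but nothing above ever creates that file, so
# the cleanup would otherwise re-run on every invocation. Below is a minimal
# sketch of how the file could be produced, assuming the intended index is a
# pickled scipy.spatial.cKDTree over the star coordinates; the use of scipy
# and pickle here is an assumption, not confirmed by the original script.
import pickle
from scipy.spatial import cKDTree

if not os.path.isfile("stars.kdt"):
    stars = pd.read_csv(
        "stars.csv",
        encoding="utf-8",
        names=["id", "type", "name", "mult", "x", "y", "z"],
    )
    # cKDTree instances are picklable, so the built tree can be cached on disk
    # and reloaded for nearest-neighbour queries along a route.
    tree = cKDTree(stars[["x", "y", "z"]].to_numpy())
    with open("stars.kdt", "wb") as kdt_file:
        pickle.dump(tree, kdt_file)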