add base build_db code
This commit is contained in:
parent
1ed3e60f16
commit
08f0dd27f7
4 changed files with 75 additions and 1 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -160,3 +160,7 @@ cython_debug/
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
|
||||||
|
*.csv
|
||||||
|
*.csv.gz
|
||||||
|
*.db
|
||||||
|
|
12
README.md
12
README.md
|
@ -1,3 +1,15 @@
|
||||||
# e621_api_cloner
|
# e621_api_cloner
|
||||||
|
|
||||||
use e621 db dumps to run your own e621 api server, so you can scrape as much as you want without getting rate-limited (this is a very specific use case i promise i am useful to society)
|
use e621 db dumps to run your own e621 api server, so you can scrape as much as you want without getting rate-limited (this is a very specific use case i promise i am useful to society)
|
||||||
|
|
||||||
|
## how
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python3 -m venv env
|
||||||
|
env/bin/pip install -Ur requirements.txt
|
||||||
|
|
||||||
|
# go to https://e621.net/db_export/ to find out the available db dumps
|
||||||
|
# the resulting database file will be available at e621.db
|
||||||
|
env/bin/python ./build_database.py 2022-08-27
|
||||||
|
env/bin/python ./e621_api_cloner.py ./e621.db
|
||||||
|
```
|
||||||
|
|
55
build_database.py
Normal file
55
build_database.py
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
# Module-level logger named after this module; main() reports download
# progress through it.
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Context:
    """Bundle of shared state passed around while building the database."""

    # HTTP client session through which the export files are requested.
    session: aiohttp.ClientSession
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Download the e621 database export files for the requested date.

    The export date (for example ``2022-08-27``) is read from
    ``sys.argv[1]``.  The ``tags`` and ``posts`` gzipped CSV exports for that
    date are fetched into the current working directory; a file that already
    exists there is left untouched and not downloaded again.

    Raises:
        SystemExit: if no export date was given on the command line.
        RuntimeError: if the server answers with a status other than 200.
    """
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an opaque IndexError.
        sys.exit("usage: build_database.py <export-date, e.g. 2022-08-27>")

    wanted_date = sys.argv[1]
    urls = (
        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
    )

    async with aiohttp.ClientSession() as session:
        ctx = Context(session)

        for url in urls:
            # The output file keeps the name of the last URL path segment.
            output_path = Path.cwd() / Path(urlparse(url).path).name

            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
                continue

            log.info("downloading %r into %s", url, output_path)
            async with ctx.session.get(url) as resp:
                # Explicit check rather than ``assert``: assertions are
                # stripped when Python runs with -O.
                if resp.status != 200:
                    raise RuntimeError(
                        f"failed to download {url!r}: HTTP {resp.status}"
                    )

                # Stream into a temporary file first so the output path is
                # only created once the whole response has been received.
                with tempfile.TemporaryFile() as temp_fd:
                    async for chunk in resp.content.iter_chunked(8192):
                        temp_fd.write(chunk)

                    # Rewind before copying: copyfileobj starts at the
                    # current file position, which is the end of the data
                    # after the writes above. Without this the output file
                    # would be empty.
                    temp_fd.seek(0)

                    # write to output
                    log.info("copying temp to output")
                    with output_path.open(mode="wb") as output_fd:
                        shutil.copyfileobj(temp_fd, output_fd)

        # now that everything's downloaded, compile the db
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Enable INFO-level output first so the progress messages logged while
    # main() runs are actually shown.
    logging.basicConfig(level=logging.INFO)

    # Drive the async entry point to completion on a fresh event loop.
    entry_point = main()
    asyncio.run(entry_point)
|
3
requirements.txt
Normal file
3
requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
aiohttp==3.8.1
|
||||||
|
aiosqlite==0.17.0
|
||||||
|
Quart==0.18.0
|
Loading…
Reference in a new issue