1
0

refactor structure for sane workflow :)

This commit is contained in:
2025-08-04 08:20:59 +09:00
parent 6dbc4a4e54
commit 540a84e772
4 changed files with 57 additions and 45 deletions

36
main.py Normal file
View File

@@ -0,0 +1,36 @@
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import Retry
from adapter import TimeoutHTTPAdapter
from scraper import Scraper
# Crawl search results page by page and archive every post found.
#
# NOTE(review): Scraper and TimeoutHTTPAdapter are project-local; their exact
# semantics (e.g. what `concurrency=5` limits) are not visible from this file.
scraper = Scraper(concurrency=5)

# Retry failed requests up to five times; 404 responses are explicitly
# included in the retryable statuses (presumably the site returns spurious
# 404s -- confirm before changing).
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))

document_path = Path('archives')
attachment_path = document_path / 'attachments'
# parents=True creates `archives` as well, so one call prepares both folders.
attachment_path.mkdir(parents=True, exist_ok=True)

try:
    # Walk 100 search pages, stepping the (negated) offset by 10000 each time.
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            # Search once and reuse the result: the original issued the same
            # search request a second time just to build the task list.
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')

            futures = [
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path,
                )
                for post in posts
            ]

            # as_completed() is lazy; it must be iterated, and result() must
            # be called, otherwise exceptions raised in workers are lost.
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as error:
                    # Best-effort crawl: report the failure and keep going.
                    print(f'failed to archive a post: {error!r}')
except KeyboardInterrupt:
    print(':-)')