# mandu-crawler/main.py

from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from requests.adapters import Retry

from adapter import TimeoutHTTPAdapter
from scraper import Scraper


# Shared scraper instance used by the crawl loop below.
scraper = Scraper(concurrency=5)

# Retry policy for HTTPS requests: up to 5 retries with a 0.1 backoff factor.
# NOTE(review): 404 is in the forced-retry list, so "not found" responses are
# retried as well -- confirm this is intentional for the target site.
retries = Retry(
    total=5,
    backoff_factor=0.1,
    status_forcelist=[404],
)
https_adapter = TimeoutHTTPAdapter(timeout=1, max_retries=retries)
scraper.mount('https://', https_adapter)

# Output locations; attachments live in a sub-directory of the document
# directory, so creating it (with parents) creates both.
document_path = Path('archives')
attachment_path = document_path.joinpath('attachments')
attachment_path.mkdir(parents=True, exist_ok=True)

# Search window: WINDOW_COUNT offsets starting at START_OFFSET, OFFSET_STEP
# apart. Each offset is negated before being passed to ``scraper.search``.
# NOTE(review): presumably the negative value is a search position understood
# by the remote site -- confirm against Scraper.search.
START_OFFSET = 22760681
OFFSET_STEP = 10000
WINDOW_COUNT = 100

try:
    # One pool is reused for the whole run; every batch is fully drained
    # before the next search, so at most one batch is in flight at a time.
    with ThreadPoolExecutor() as executor:
        for offset in range(
            START_OFFSET,
            START_OFFSET + OFFSET_STEP * WINDOW_COUNT,
            OFFSET_STEP,
        ):
            # Search once and reuse the result for both the report and the
            # submitted jobs, so the printed count matches what is archived.
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')

            pending = {
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path,
                ): post
                for post in posts
            }

            # Actually consume as_completed() so worker exceptions surface.
            # A failed post is reported and skipped rather than aborting the
            # whole crawl.
            for future in as_completed(pending):
                try:
                    future.result()
                except Exception as exc:
                    print(f'failed to archive {pending[future]!r}: {exc!r}')

except KeyboardInterrupt:
    print(':-)')