from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from requests.adapters import Retry

from adapter import TimeoutHTTPAdapter
from scraper import Scraper

# Scraper session: 5 concurrent workers, a 1-second request timeout,
# and up to 5 retries (404 responses included) with a small backoff.
scraper = Scraper(concurrency=5)
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))

# Output directories: archives/ for documents, archives/attachments/ for attachments.
document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)

try:
    # Step through search offsets in blocks of 10,000.
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            # Search once per offset and reuse the result, rather than querying twice.
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')

            futures = [
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path,
                )
                for post in posts
            ]
            # Drain the futures so any exception raised in a worker thread is re-raised here.
            for future in as_completed(futures):
                future.result()
except KeyboardInterrupt:
    print(':-)')