refactor structure for sane workflow :)
This commit is contained in:
36
main.py
Normal file
36
main.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""Archive-scraper entry point.

Configures a shared Scraper session with timeout/retry behaviour, then walks
backwards through 100 pages of post offsets, downloading each found post (and
its attachments) concurrently until the user interrupts with Ctrl-C.
"""
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from requests.adapters import Retry

from adapter import TimeoutHTTPAdapter
from scraper import Scraper


# Shared scraping session; `concurrency` caps simultaneous downloads.
scraper = Scraper(concurrency=5)

# Retry up to 5 times with backoff on HTTPS requests.
# NOTE(review): retrying on 404 is unusual (404 is normally final) — confirm
# the upstream service really returns transient 404s for live posts.
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))

# Local output layout: archives/ for documents, archives/attachments/ nested
# inside it. mkdir(parents=True) creates both in one call.
document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)

try:
    # 100 pages of 10_000 posts, starting at offset 22_760_681; the search
    # API is queried with the negated offset.
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')
            futures = [
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path,
                )
                # BUG FIX: reuse the `posts` fetched above — the original
                # issued a second, identical search request here, doubling
                # network traffic and risking a mismatch with the printed
                # count if the two responses differed.
                for post in posts
            ]
            # BUG FIX: the original called as_completed([...]) without
            # iterating it, so worker exceptions were silently discarded.
            # Consume the futures and report failures; stay best-effort
            # (log, don't raise) so one bad post doesn't abort the run.
            for future in as_completed(futures):
                exc = future.exception()
                if exc is not None:
                    print(f'failed to archive a post: {exc!r}')
except KeyboardInterrupt:
    print(':-)')
|
Reference in New Issue
Block a user