diff --git a/main.py b/main.py
new file mode 100644
index 0000000..9bda43c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from requests.adapters import Retry
+
+from adapter import TimeoutHTTPAdapter
+from scraper import Scraper
+
+
+scraper = Scraper(concurrency=5)
+
+# NOTE(review): 404 in status_forcelist makes "not found" responses retry — confirm intentional
+retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
+scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
+
+document_path = Path('archives')
+attachment_path = document_path / 'attachments'
+attachment_path.mkdir(parents=True, exist_ok=True)
+
+try:
+    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
+        with ThreadPoolExecutor() as executor:
+            posts = scraper.search('aoegame', 'name', -offset, 'ori')
+            print(f'searching offset {-offset}, found {len(posts)} post(s)')
+            # reuse the results fetched above instead of issuing a second identical search request
+            as_completed([
+                executor.submit(
+                    scraper.view,
+                    post,
+                    document_path=document_path,
+                    attachment_path=attachment_path
+                )
+                for post in posts
+            ])
+
+
+except KeyboardInterrupt:
+    print(':-)')
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 0000000..9835eb9
--- /dev/null
+++ b/models/__init__.py
@@ -0,0 +1 @@
+from .post import *
diff --git a/models/post.py b/models/post.py
new file mode 100644
index 0000000..c2c7356
--- /dev/null
+++ b/models/post.py
@@ -0,0 +1,17 @@
+from typing import Optional
+from dataclasses import dataclass
+from datetime import datetime
+
+from bs4 import Tag
+
+
+@dataclass
+class Post:
+    boardId: str
+    postId: int
+    authorId: Optional[str] = None
+    authorName: Optional[str] = None
+    category: Optional[str] = None
+    title: Optional[str] = None
+    body: Optional[Tag] = None
+    created_at: Optional[datetime] = None
diff --git a/scraper.py b/scraper.py
index fb9bb16..ac9994a 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,8 +1,7 @@
 import re
 import hashlib
 
-from typing import Optional, Tuple, List
-from dataclasses import dataclass
+from typing import Tuple, List
 from datetime import datetime
 from zoneinfo import ZoneInfo
 from pathlib import Path
@@ -15,22 +14,10 @@
 from requests import Session
 from requests.adapters import Retry
 from requests.cookies import create_cookie
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup
 from cgi import parse_header
 
-from adapter import TimeoutHTTPAdapter
-
-
-@dataclass
-class Post:
-    boardId: str
-    postId: int
-    authorId: Optional[str] = None
-    authorName: Optional[str] = None
-    category: Optional[str] = None
-    title: Optional[str] = None
-    body: Optional[Tag] = None
-    created_at: Optional[datetime] = None
+from models import Post
 
 
 class Scraper(Session):
@@ -156,32 +143,3 @@
 
         if document_path:
             (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
-
-
-scraper = Scraper(concurrency=5)
-
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
-
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-
-
-except KeyboardInterrupt:
-    print(':-)')