refactor structure for sane workflow :)
main.py (new file)
@@ -0,0 +1,36 @@
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from requests.adapters import Retry

from adapter import TimeoutHTTPAdapter
from scraper import Scraper


scraper = Scraper(concurrency=5)

retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))

document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)

try:
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')
            as_completed([
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path
                )
                for post in posts
            ])


except KeyboardInterrupt:
    print(':-)')
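The adapter module is not part of this commit, so the exact behaviour of TimeoutHTTPAdapter is an assumption here. The conventional shape, matching the timeout= and max_retries= constructor arguments used above, is an HTTPAdapter subclass that injects a default timeout into every request. A minimal sketch:

# Hypothetical sketch of adapter.py (not included in this commit).
from requests.adapters import HTTPAdapter


class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, timeout=None, **kwargs):
        # Default per-request timeout in seconds; everything else
        # (e.g. max_retries) passes through to HTTPAdapter.
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Apply the default only when the caller did not set a timeout.
        if kwargs.get('timeout') is None:
            kwargs['timeout'] = self.timeout
        return super().send(request, **kwargs)

One subtlety in the loop above: the as_completed([...]) iterator is built but never consumed. Each batch still finishes, because ThreadPoolExecutor's context manager blocks on exit until all submitted futures complete, but an exception raised inside scraper.view is stored on its future and silently dropped. Iterating the futures and calling result() would surface those errors:

# Drop-in alternative for the bare as_completed(...) expression above.
futures = [
    executor.submit(scraper.view, post,
                    document_path=document_path,
                    attachment_path=attachment_path)
    for post in posts
]
for future in as_completed(futures):
    future.result()  # re-raises any exception from scraper.view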
models/__init__.py (new file)
@@ -0,0 +1 @@
from .post import *
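Because models/post.py (below) defines no __all__, this star import re-exports every public name the module binds, including its own imports (Optional, dataclass, datetime, Tag), not just Post. If that matters, an explicit export list in models/post.py narrows it:

# Suggested addition to models/post.py (not in this commit):
# limits what `from .post import *` exposes.
__all__ = ['Post']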
models/post.py (new file)
@@ -0,0 +1,17 @@
from typing import Optional
from dataclasses import dataclass
from datetime import datetime

from bs4 import Tag


@dataclass
class Post:
    boardId: str
    postId: int
    authorId: Optional[str] = None
    authorName: Optional[str] = None
    category: Optional[str] = None
    title: Optional[str] = None
    body: Optional[Tag] = None
    created_at: Optional[datetime] = None
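The @dataclass decorator generates __init__, __repr__, and __eq__ from these annotations: boardId and postId are required, and every other field defaults to None and can be passed by keyword. A hypothetical construction (the values here are placeholders):

# Only the two required fields plus any keyword overrides are needed.
post = Post(boardId='aoegame', postId=22760681, authorName='ori')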
scraper.py (48 changed lines)
@@ -1,8 +1,7 @@
 import re
 import hashlib

-from typing import Optional, Tuple, List
-from dataclasses import dataclass
+from typing import Tuple, List
 from datetime import datetime
 from zoneinfo import ZoneInfo
 from pathlib import Path
@@ -15,22 +14,10 @@ from requests import Session
 from requests.adapters import Retry
 from requests.cookies import create_cookie

-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup

 from cgi import parse_header
-from adapter import TimeoutHTTPAdapter
+from models import Post


-@dataclass
-class Post:
-    boardId: str
-    postId: int
-    authorId: Optional[str] = None
-    authorName: Optional[str] = None
-    category: Optional[str] = None
-    title: Optional[str] = None
-    body: Optional[Tag] = None
-    created_at: Optional[datetime] = None
-
-
 class Scraper(Session):
@@ -156,32 +143,3 @@ class Scraper(Session):

         if document_path:
             (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
-
-
-scraper = Scraper(concurrency=5)
-
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
-
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-
-
-except KeyboardInterrupt:
-    print(':-)')
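Because Scraper subclasses requests.Session, the mount call in main.py is the standard Session API: Session.mount(prefix, adapter) routes every request whose URL starts with prefix through that adapter, with the longest matching prefix winning. With the Retry policy above, urllib3 retries up to five times, treats a 404 response as retryable (status_forcelist=[404], presumably because the origin 404s transiently), and sleeps roughly backoff_factor * 2**(n-1) seconds between attempts; exact first-retry behaviour varies across urllib3 versions. The class body itself is elided by this diff; a skeleton consistent with the calls made in main.py, purely as an assumption:

# Hypothetical skeleton only; the real search()/view() bodies are not
# shown in this diff.
class Scraper(Session):
    def __init__(self, concurrency: int = 5):
        super().__init__()  # initialise the underlying Session first
        self.concurrency = concurrency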