
refactor structure for sane workflow :)

2025-08-04 08:20:59 +09:00
parent 6dbc4a4e54
commit 540a84e772
4 changed files with 57 additions and 45 deletions

main.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import Retry
from adapter import TimeoutHTTPAdapter
from scraper import Scraper
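
# One shared scraping session: retries and a per-request timeout apply to every https:// request.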
scraper = Scraper(concurrency=5)
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
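
# Saved pages go under archives/, downloaded attachments under archives/attachments/.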
document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)
try:
    # Sweep the search index in steps of 10,000 posts.
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')
            # Archive every post found at this offset concurrently.
            futures = [
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path
                )
                for post in posts
            ]
            # Wait for the downloads and surface any errors as they finish.
            for future in as_completed(futures):
                future.result()
except KeyboardInterrupt:
    print(':-)')

models/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
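# Re-export the Post model so `from models import Post` resolves.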
from .post import *

models/post.py (new file, 17 lines)

@@ -0,0 +1,17 @@
from typing import Optional
from dataclasses import dataclass
from datetime import datetime
from bs4 import Tag
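
# One scraped board post; `body` keeps the parsed HTML of the post content as a bs4 Tag.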
@dataclass
class Post:
    boardId: str
    postId: int
    authorId: Optional[str] = None
    authorName: Optional[str] = None
    category: Optional[str] = None
    title: Optional[str] = None
    body: Optional[Tag] = None
    created_at: Optional[datetime] = None

scraper.py (modified)

@@ -1,8 +1,7 @@
 import re
 import hashlib
-from typing import Optional, Tuple, List
-from dataclasses import dataclass
+from typing import Tuple, List
 from datetime import datetime
 from zoneinfo import ZoneInfo
 from pathlib import Path
@@ -15,22 +14,10 @@ from requests import Session
 from requests.adapters import Retry
 from requests.cookies import create_cookie
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup
 from cgi import parse_header
 from adapter import TimeoutHTTPAdapter
-@dataclass
-class Post:
-    boardId: str
-    postId: int
-    authorId: Optional[str] = None
-    authorName: Optional[str] = None
-    category: Optional[str] = None
-    title: Optional[str] = None
-    body: Optional[Tag] = None
-    created_at: Optional[datetime] = None
+from models import Post

 class Scraper(Session):
@@ -156,32 +143,3 @@ class Scraper(Session):
         if document_path:
             (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
-scraper = Scraper(concurrency=5)
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-except KeyboardInterrupt:
-    print(':-)')