
refactor structure for sane workflow :)

2025-08-04 08:20:59 +09:00
parent 6dbc4a4e54
commit 540a84e772
4 changed files with 57 additions and 45 deletions

main.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import Retry
from adapter import TimeoutHTTPAdapter
from scraper import Scraper
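
# One shared scraping session: retries and a per-request timeout apply to every https:// request.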
scraper = Scraper(concurrency=5)
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
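
# Saved pages go under archives/, downloaded attachments under archives/attachments/.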
document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)
try:
    # Sweep the search index in steps of 10,000 posts.
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')
            # Archive every post found at this offset concurrently.
            futures = [
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path
                )
                for post in posts
            ]
            # Wait for the downloads and surface any errors as they finish.
            for future in as_completed(futures):
                future.result()
except KeyboardInterrupt:
    print(':-)')

models/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
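# Re-export the Post model so `from models import Post` resolves.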
from .post import *

models/post.py (new file, 17 lines)

@@ -0,0 +1,17 @@
from typing import Optional
from dataclasses import dataclass
from datetime import datetime
from bs4 import Tag
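
# One scraped board post; `body` keeps the parsed HTML of the post content as a bs4 Tag.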
@dataclass
class Post:
    boardId: str
    postId: int
    authorId: Optional[str] = None
    authorName: Optional[str] = None
    category: Optional[str] = None
    title: Optional[str] = None
    body: Optional[Tag] = None
    created_at: Optional[datetime] = None

scraper.py (modified)

@@ -1,8 +1,7 @@
 import re
 import hashlib
-from typing import Optional, Tuple, List
-from dataclasses import dataclass
+from typing import Tuple, List
 from datetime import datetime
 from zoneinfo import ZoneInfo
 from pathlib import Path
@@ -15,22 +14,10 @@ from requests import Session
 from requests.adapters import Retry
 from requests.cookies import create_cookie
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup
 from cgi import parse_header
 from adapter import TimeoutHTTPAdapter
-@dataclass
-class Post:
-    boardId: str
-    postId: int
-    authorId: Optional[str] = None
-    authorName: Optional[str] = None
-    category: Optional[str] = None
-    title: Optional[str] = None
-    body: Optional[Tag] = None
-    created_at: Optional[datetime] = None
+from models import Post

 class Scraper(Session):
@@ -156,32 +143,3 @@ class Scraper(Session):
         if document_path:
             (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
-scraper = Scraper(concurrency=5)
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-except KeyboardInterrupt:
-    print(':-)')