
migrate to aiohttp

2025-08-04 09:56:02 +09:00
parent 540a84e772
commit 8ad93caa90
10 changed files with 175 additions and 189 deletions

adapter.py

@@ -1,14 +0,0 @@
from requests.adapters import HTTPAdapter

class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        if "timeout" in kwargs:
            self.timeout = kwargs["timeout"]
            del kwargs["timeout"]
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        timeout = kwargs.get("timeout")
        if timeout is None and hasattr(self, 'timeout'):
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
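
The deleted adapter existed only to give every request a default timeout. In aiohttp the same default can be set once on the session via ClientTimeout; a minimal sketch, not part of this commit:

    import aiohttp

    async def fetch(url: str) -> str:
        # Sketch only: a 1-second total timeout, mirroring the deleted
        # TimeoutHTTPAdapter(timeout=1); applies to every request made
        # through the session unless overridden per call.
        timeout = aiohttp.ClientTimeout(total=1)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                return await response.text()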

36
main.py

@@ -1,36 +0,0 @@
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import Retry
from adapter import TimeoutHTTPAdapter
from scraper import Scraper

scraper = Scraper(concurrency=5)
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))

document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)

try:
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')
            as_completed([
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path
                )
                for post in posts  # reuse the list fetched above instead of searching twice
            ])
except KeyboardInterrupt:
    print(':-)')

models/__init__.py

@@ -1 +1 @@
-from .post import *
+from .post import Post

models/post.py

@@ -4,11 +4,14 @@ from datetime import datetime
 from bs4 import Tag
+from utils.typings import BoardPath

 @dataclass
 class Post:
+    id: int
     boardId: str
-    postId: int
+    boardPath: BoardPath
     authorId: Optional[str] = None
     authorName: Optional[str] = None
     category: Optional[str] = None

requirements.txt

@@ -1,8 +1,16 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+attrs==25.3.0
 beautifulsoup4==4.13.4
 certifi==2025.7.14
 charset-normalizer==3.4.2
+frozenlist==1.7.0
 idna==3.10
-requests==2.32.4
+lxml==6.0.0
+multidict==6.6.3
+propcache==0.3.2
 soupsieve==2.7
 typing_extensions==4.14.1
 urllib3==2.5.0
+yarl==1.20.1


@@ -1,145 +1,22 @@
-import re
-import hashlib
-from typing import Tuple, List
-from datetime import datetime
-from zoneinfo import ZoneInfo
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-from threading import Semaphore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from requests import Session
-from requests.adapters import Retry
-from requests.cookies import create_cookie
-from bs4 import BeautifulSoup
-from cgi import parse_header
-from models import Post
-
-class Scraper(Session):
-    def __init__(self, concurrency = 5):
-        super().__init__()
-        self.semaphore = Semaphore(concurrency)
-        self.headers['User-Agent'] = '(Android)'
-        self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
-        self.cookies.set_cookie(create_cookie(
-            name='list_count',
-            value='200',
-            domain='.dcinside.com'
-        ))
-
-    def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
-        with self.semaphore:
-            res = self.get(url, stream=True)
-            res.raise_for_status()
-            hash = hashlib.sha1()
-            # fuck this shit
-            _, parts = parse_header(res.headers.get('Content-Disposition'))
-            fname = parts.get('filename', '')
-            fext = fname.split('.').pop()
-            with NamedTemporaryFile('wb', dir=save_dir) as file:
-                for chunk in res.iter_content(chunk_size=8192):
-                    if chunk:
-                        file.write(chunk)
-                        hash.update(chunk)
-                return url, Path(file.name).rename(save_dir / f'{hash.hexdigest()}.{fext}')
-
-    def replace_attachment(self, post: Post, save_dir: Path):
-        src_to_tags = {
-            img.attrs['data-original'].strip(): img
-            for img in post.body.select('img[data-original]')
-        }
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.download_attachment, src, save_dir)
-                for src in src_to_tags.keys()
-            ]
-            for future in as_completed(futures):
-                src, path = future.result()
-                src_to_tags[src]['src'] = path
-
-    def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
-        with self.semaphore:
-            res = self.get(
-                f'https://m.dcinside.com/board/{boardId}',
-                params={
-                    's_type': type,
-                    's_pos': offset,
-                    'serval': value
-                })
-            res.raise_for_status()
-            document = BeautifulSoup(res.text, 'html.parser')
-            return [
-                Post(
-                    boardId=boardId,
-                    postId=int(re.findall(r'/\d+', tag.attrs.get('href'))[0][1:])
-                )
-                for tag in document.select('.gall-detail-lnktb a[href]:first-child')
-            ]
-
-    def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
-        with self.semaphore:
-            res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
-            res.raise_for_status()
-            document = BeautifulSoup(res.text, 'html.parser')
-
-            titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
-            timeTag = titleWrapTags.pop()
-            authorTag = titleWrapTags.pop()
-            authorAnchorTag = authorTag.select_one('a')
-
-            titleParts = (
-                document
-                .select_one('.gallview-tit-box .tit')
-                .get_text(strip=True)
-                .split('\r\n')
-            )
-            post.title = titleParts.pop().strip()
-            post.category = titleParts.pop()[1:-1].strip()
-
-            if authorAnchorTag:
-                post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
-                post.authorName = authorAnchorTag.get_text(strip=True)
-            else:
-                authorParts = authorTag.get_text(strip=True).split('(')
-                post.authorId = authorParts[1][:-1].strip()
-                post.authorName = authorParts[0].strip()
-
-            post.created_at = (
-                datetime
-                .strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
-                .replace(tzinfo=ZoneInfo('Asia/Seoul'))
-            )
-            post.body = document.select_one('.thum-txtin')
-
-            # lol
-            if post.authorName != 'ori':
-                return
-
-            print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
-            if attachment_path:
-                self.replace_attachment(post, attachment_path)
-            if document_path:
-                (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
+import asyncio
+from pathlib import Path
+
+from utils.middlewares import SemaphoreMiddleware
+from utils.scraper import Scraper
+
+async def main():
+    middlewares = (
+        SemaphoreMiddleware(5),
+    )
+    async with Scraper(middlewares=middlewares) as scraper:
+        posts = await scraper.list('roh', 'person')
+        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
+            await future
+
+if __name__ == '__main__':
+    asyncio.run(main())

0
utils/__init__.py Normal file

8
utils/middlewares.py Normal file

@@ -0,0 +1,8 @@
import asyncio
from aiohttp import ClientRequest, ClientResponse, ClientHandlerType

class SemaphoreMiddleware(asyncio.Semaphore):
    # Client middleware that caps how many requests may be in flight at once.
    async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
        async with self:
            return await handler(req)
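
The old requests stack also mounted urllib3's Retry; this commit drops that behaviour without a replacement. The same middleware pattern could restore it. A hypothetical sketch, not part of this commit, in the same style as SemaphoreMiddleware:

    import asyncio
    from aiohttp import ClientRequest, ClientResponse, ClientHandlerType

    # Hypothetical: re-creates the old
    # Retry(total=5, backoff_factor=0.1, status_forcelist=[404]) behaviour.
    class RetryMiddleware:
        def __init__(self, total: int = 5, backoff_factor: float = 0.1, status_forcelist=(404,)):
            self.total = total
            self.backoff_factor = backoff_factor
            self.status_forcelist = set(status_forcelist)

        async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
            for attempt in range(self.total):
                response = await handler(req)
                if response.status not in self.status_forcelist:
                    break
                # Exponential backoff before retrying, like urllib3's backoff_factor
                await asyncio.sleep(self.backoff_factor * (2 ** attempt))
            return response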

134
utils/scraper.py Normal file

@@ -0,0 +1,134 @@
import re
from typing import Optional, List
from datetime import datetime
from zoneinfo import ZoneInfo

from aiohttp import ClientSession
from bs4 import BeautifulSoup

from .typings import BoardPath, SearchType
from models import Post

class Scraper(ClientSession):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Default headers required for mobile pages and attachment requests
        self.headers['User-Agent'] = '(Android)'
        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'
        # Number of items to fetch per post-list request
        self.cookie_jar.update_cookies({
            'list_count': '200'
        })

    async def __aenter__(self) -> 'Scraper':
        return self

    async def list(
        self,
        boardId: str,
        boardPath: BoardPath = 'board',
        page: int = 1,
        categoryId: int = 0,
        only_recommended: bool = False,
        only_notice: bool = False,
        search_type: Optional[SearchType] = None,
        search_position: Optional[int] = None,
        search_value: Optional[str] = None
    ) -> List[Post]:
        """
        Fetches the posts on a board that match the given conditions.

        :param boardId: board ID
        :param boardPath: board path (board type)
        :param page: page number
        :param categoryId: category (subject header) ID
        :param only_recommended: whether to list recommended posts only
        :param only_notice: whether to list notice posts only
        :param search_type: search type
        :param search_position: search offset
        :param search_value: search keyword
        """
        url = f'https://m.dcinside.com/{boardPath}/{boardId}'
        params = {
            'page': page,
            'headid': categoryId,
            'recommend': only_recommended and '1' or '0',
            'notice': only_notice and '1' or '0',
            's_type': search_type or '',
            's_pos': search_position or '',
            'serval': search_value or ''
        }
        async with self.get(url, params=params) as response:
            html = await response.text()

        document = BeautifulSoup(html, 'lxml')
        return [
            Post(
                id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
                boardId=boardId,
                boardPath=boardPath
            )
            for tag in document.select('.gall-detail-lnktb')
        ]

    async def view(self, post: Post):
        """
        Fetches the content of a post.

        :param post: the post instance to fetch
        """
        async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
            html = await response.text()

        document = BeautifulSoup(html, 'lxml')

        # The title header is split into `li` elements, and no matter what there
        # are exactly two of them; if not, the whole parse is broken anyway, so
        # there is no error handling here.
        authorTag, timestampTag, *_ = document.select('.gallview-tit-box .ginfo2 > li')
        authorAnchorTag = authorTag.select_one('a')

        # Parse the creation time
        post.created_at = (
            datetime
            .strptime(timestampTag.get_text(strip=True), '%Y.%m.%d %H:%M')
            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
        )

        # Parse the author information
        if authorAnchorTag:
            # An anchor tag in the author element means the author has a gallog
            post.authorId = re.findall(r'\/\w+$', authorAnchorTag['href'])[0][1:]
            post.authorName = authorAnchorTag.get_text(strip=True)
        else:
            authorParts = authorTag.get_text(strip=True).split('(')
            post.authorId = authorParts.pop()[:-1].strip()  # 123.123) -> 123.123
            post.authorName = authorParts.pop().strip()

        # On the mobile web the category and title are separated by `\n`
        titleTexts = (
            document
            .select_one('.gallview-tit-box .tit')
            .get_text(strip=True)
            .split('\n')
        )

        # Parse the title and category
        post.title = titleTexts.pop().strip()
        post.category = titleTexts.pop()[1:-1].strip()  # [XX] -> XX

        # Parse the body
        post.body = document.select_one('.thum-txtin')

        # Remove unneeded elements from the body
        for tag in post.body.select('script, style'):
            tag.extract()

        print(f'{post.boardId}/{post.id}: {post.title}')
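
The old scraper.search('aoegame', 'name', -offset, 'ori') call from the deleted main.py maps onto the new keyword-based signature. A hypothetical equivalent, assuming an offset variable as before:

    # Hypothetical sketch, not part of this commit: the old positional
    # search() call expressed through the new list() signature.
    posts = await scraper.list(
        'aoegame',                 # boardId
        'board',                   # boardPath
        search_type='name',        # old s_type
        search_position=-offset,   # old s_pos
        search_value='ori'         # old serval
    )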

6
utils/typings.py Normal file

@@ -0,0 +1,6 @@
from typing import Literal
BoardPath = Literal['board', 'mini', 'person']
SearchType = Literal['subject_m', 'subject', 'memo', 'name', 'comment']
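
BoardPath and SearchType are Literal aliases, so a type checker rejects values outside these sets. A small illustrative sketch; the board_url helper is hypothetical:

    from utils.typings import BoardPath

    # Hypothetical helper for illustration only; mirrors the URL
    # pattern used in utils/scraper.py.
    def board_url(boardId: str, boardPath: BoardPath) -> str:
        return f'https://m.dcinside.com/{boardPath}/{boardId}'

    board_url('aoegame', 'board')    # OK
    board_url('aoegame', 'gallery')  # flagged by a type checker: not a BoardPath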