migrate to aiohttp

2025-08-04 09:56:02 +09:00
parent 540a84e772
commit 8ad93caa90
10 changed files with 175 additions and 189 deletions
--- a/utils/init.py
+++ b/utils/init.py
--- a/utils/middlewares.py
+++ b/utils/middlewares.py
@@ -0,0 +1,8 @@
+import asyncio
+
+from aiohttp import ClientRequest, ClientResponse, ClientHandlerType
+
+class SemaphoreMiddleware(asyncio.Semaphore):
+    async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
+        async with self:
+            return await handler(req)
--- a/utils/scraper.py
+++ b/utils/scraper.py
@@ -0,0 +1,134 @@
+import re
+from typing import Optional, List
+from datetime import datetime
+from zoneinfo import ZoneInfo
+
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
+
+from .typings import BoardPath, SearchType
+from models import Post
+
+
+class Scraper(ClientSession):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # 모바일 페이지 및 첨부 파일 요청 시 필요한 기본 헤더 값
+        self.headers['User-Agent'] = '(Android)'
+        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'
+
+        # 게시글 목록 조회로 한 번에 불러올 항목 수
+        self.cookie_jar.update_cookies({
+            'list_count': '200'
+        })
+
+    async def __aenter__(self) -> 'Scraper':
+        return self
+
+
+    async def list(
+        self, 
+        boardId: str,
+        boardPath: BoardPath = 'board',
+        page: int = 1,
+        categoryId: int = 0,
+        only_recommended: bool = False,
+        only_notice: bool = False,
+        search_type: Optional[SearchType] = None,
+        search_position: Optional[int] = None, 
+        search_value: Optional[str]     = None
+    ) -> List[Post]:
+        """
+        특정 게시판으로부터 특정 조건에 맞는 게시글 목록을 가져옵니다
+
+        :param boardId:             게시판 아이디
+        :param boardPath:           게시판 경로(종류)
+        :param page:                페이지 번호
+        :param categoryId:          말머리 아이디
+        :param only_recommended:    개념글 게시글만 조회할지?
+        :param only_notice:         공지 게시글만 조회할지?
+        :param search_type:         검색 종류
+        :param search_position:     검색 지점
+        :param search_value:        검색어
+        """
+
+        url = f'https://m.dcinside.com/{boardPath}/{boardId}'
+        params = {
+            'page': page,
+            'headid': categoryId,
+            'recommend': only_recommended and '1' or '0',
+            'notice': only_notice and '1' or '0',
+            's_type': search_type or '',
+            's_pos': search_position or '',
+            'serval': search_value or ''
+        }
+
+        async with self.get(url, params=params) as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+        return [
+            Post(
+                id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
+                boardId=boardId,
+                boardPath=boardPath
+            )
+            for tag in document.select('.gall-detail-lnktb')
+        ]
+    
+    async def view(self, post: Post):
+        """
+        게시글 내용을 조회합니다
+
+        :param post:    조회할 게시글 인스턴스
+        """
+        
+        async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+        # 상단 제목 요소는 `li`로 나누어져있고 무슨 지랄을 해도 정확히 2개임
+        # 만약 아니라면 어처피 파싱 무결성 전체가 깨질테니 예외 처리는 나도 몰?루
+        authorTag, timestampTag, *_ = document.select('.gallview-tit-box .ginfo2 > li')
+        authorAnchorTag = authorTag.select_one('a')
+
+        # 작성일 파싱
+        post.created_at = (
+            datetime
+                .strptime(timestampTag.get_text(strip=True), '%Y.%m.%d %H:%M')
+                .replace(tzinfo=ZoneInfo('Asia/Seoul'))
+
+        )
+
+        # 작성자 정보 파싱
+        if authorAnchorTag:
+            # 작성자 요소에 앵커 태그가 있다면 갤로그가 존재하는 상태임
+            post.authorId = re.findall(r'\/\w+$', authorAnchorTag['href'])[0][1:]
+            post.authorName = authorAnchorTag.get_text(strip=True)
+        else:
+            authorParts = authorTag.get_text(strip=True).split('(')
+            post.authorId = authorParts.pop()[:-1].strip() # 123.123) -> 123.123
+            post.authorName = authorParts.pop().strip()
+
+        # 모바일 웹에서 말머리와 제목은 `\n`으로 분리되어있음
+        titleTexts = (
+            document
+                .select_one('.gallview-tit-box .tit')
+                .get_text(strip=True)
+                .split('\n')
+        )
+
+        # 제목과 말머리 파싱
+        post.title = titleTexts.pop().strip()
+        post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX
+
+        # 본문 파싱
+        post.body = document.select_one('.thum-txtin')
+
+        # 불필요한 본문 요소 제거
+        for tag in post.body.select('script, style'):
+            tag.extract()
+
+        print(f'{post.boardId}/{post.id}: {post.title}')
--- a/utils/typings.py
+++ b/utils/typings.py
@@ -0,0 +1,6 @@
+from typing import Literal
+
+
+# Kind of board. utils/scraper.py uses it as the first path segment of the
+# board URL: `https://m.dcinside.com/{boardPath}/{boardId}`.
+BoardPath = Literal['board', 'mini', 'person']
+
+# Kind of search. utils/scraper.py sends it as the `s_type` query parameter.
+SearchType = Literal['subject_m', 'subject', 'memo', 'name', 'comment']