feat: fetching attachment

2025-08-04 11:35:08 +09:00
parent 02fd0ae13e
commit 5679ce1d6a
4 changed files with 169 additions and 12 deletions
--- a/utils/scraper.py
+++ b/utils/scraper.py
@@ -1,13 +1,21 @@
 import re
+import shutil
+import hashlib
+import mimetypes
+import asyncio
+import urllib.parse as urlparse
+
 from typing import Optional, List
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from datetime import datetime
 from zoneinfo import ZoneInfo

-from aiohttp import ClientSession
+from aiohttp import ClientSession, ClientError
 from bs4 import BeautifulSoup

 from .typings import BoardPath, SearchType
-from models import Post
+from models import Attachment, Post


 class Scraper(ClientSession):
@@ -27,7 +35,6 @@ class Scraper(ClientSession):
    async def __aenter__(self) -> 'Scraper':
        return self

-
    async def list(
        self, 
        boardId: str,
@@ -53,7 +60,6 @@ class Scraper(ClientSession):
        :param search_position:     검색 지점
        :param search_value:        검색어
        """
-
        url = f'https://m.dcinside.com/{boardPath}/{boardId}'
        params = {
            'page': page,
@@ -84,7 +90,6 @@ class Scraper(ClientSession):

        :param post:    조회할 게시글 인스턴스
        """
-        
        async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
            html = await response.text()
            document = BeautifulSoup(html, 'lxml')
@@ -122,7 +127,9 @@ class Scraper(ClientSession):

        # 제목과 말머리 파싱
        post.title = titleTexts.pop().strip()
-        post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX
+
+        if titleTexts:
+            post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX

        # 본문 파싱
        post.body = document.select_one('.thum-txtin')
@@ -132,3 +139,132 @@ class Scraper(ClientSession):
            tag.extract()

        print(f'{post.boardId}/{post.id}: {post.title}')
+
+    async def fetch_voice(self, id: str) -> str:
+        """
+        Resolve the real file location of a voice reply.
+
+        Requests the mobile-web voice player page (the page that posts embed
+        as an iframe) and reads the ``value`` attribute of the first
+        ``<input>`` element found in the returned HTML.
+
+        :param id: voice reply identifier (sent as the ``vr`` query argument)
+        :return: the ``value`` attribute of the first ``<input>`` element
+        :raises TypeError: if the page contains no ``<input>`` element
+        :raises KeyError: if that element has no ``value`` attribute
+        """
+        params = {
+            'vr': id,
+            'vr_open': 1
+        }
+
+        # ``self.get`` already returns an async context manager, so it is
+        # entered directly, the same way the post-fetching code in this class does.
+        async with self.get('https://m.dcinside.com/voice/player', params=params) as response:
+            html = await response.text()
+
+        # The body has been read completely, so parsing can happen after the
+        # connection has been released.
+        document = BeautifulSoup(html, 'lxml')
+
+        return document.select_one('input')['value']
+
+    async def fetch_video(self, id: str) -> str:
+        """
+        Resolve the real file location of an embedded video.
+
+        Requests the mobile-web movie player page (the page that posts embed
+        as an iframe) and reads the ``src`` attribute of the first
+        ``<source>`` element found in the returned HTML.
+
+        :param id: video identifier (sent as the ``no`` query argument)
+        :return: the ``src`` attribute of the first ``<source>`` element
+        :raises TypeError: if the page contains no ``<source>`` element
+        :raises KeyError: if that element has no ``src`` attribute
+        """
+        params = {
+            'no': id
+        }
+
+        # ``self.get`` already returns an async context manager, so it is
+        # entered directly, the same way the post-fetching code in this class does.
+        async with self.get('https://m.dcinside.com/movie/player', params=params) as response:
+            html = await response.text()
+
+        # The body has been read completely, so parsing can happen after the
+        # connection has been released.
+        document = BeautifulSoup(html, 'lxml')
+
+        return document.select_one('source')['src']
+
+    async def download_attachment(
+        self,
+        url: str,
+        save_dir: Path,
+        chunk_size: int = 8192
+    ) -> Attachment:
+        """
+        Download a single attachment into ``save_dir``.
+
+        Voice-reply and movie player URLs are first resolved to the real media
+        location. The response body is streamed into a temporary file while
+        its SHA-1 digest is computed, and is then copied to
+        ``save_dir / "<sha1 hex digest><suffix>"`` unless a file with that
+        name already exists.
+
+        :param url:         address of the attachment to download
+        :param save_dir:    local directory the file is stored in
+                            (created when it does not exist yet)
+        :param chunk_size:  number of bytes requested per streamed chunk
+        :return:            the ``Attachment`` describing the download
+        :raises ValueError: if a player URL lacks its identifier argument
+        """
+        url_parsed = urlparse.urlparse(url)
+        url_params = urlparse.parse_qs(url_parsed.query)
+
+        digest = hashlib.sha1()
+        attachment = Attachment(
+            url=url,
+            source_url=url
+        )
+
+        # ``parse_qs`` maps every key to a *list* of values, so the identifier
+        # is the first list entry, not the mapping value itself.
+        if url.startswith('https://m.dcinside.com/voice/player'):
+            # Voice reply
+            voice_ids = url_params.get('vr')
+            if not voice_ids:
+                raise ValueError(f'voice player URL has no "vr" argument: {url}')
+            attachment.source_url = await self.fetch_voice(voice_ids[0])
+
+        elif url.startswith('https://m.dcinside.com/movie/player'):
+            # Video
+            video_ids = url_params.get('no')
+            if not video_ids:
+                raise ValueError(f'movie player URL has no "no" argument: {url}')
+            attachment.source_url = await self.fetch_video(video_ids[0])
+
+        with NamedTemporaryFile('wb') as temp_file:
+            async with self.get(attachment.source_url) as response:
+                async for chunk in response.content.iter_chunked(chunk_size):
+                    temp_file.write(chunk)
+                    digest.update(chunk)
+
+            # Derive the extension from the Content-Type header
+            # (``guess_extension`` returns None for unknown types).
+            attachment.source_suffix = mimetypes.guess_extension(response.content_type)
+
+            # Prefer the real file name and extension from Content-Disposition
+            disposition = response.content_disposition
+            if disposition and disposition.filename:
+                attachment.source_filename = disposition.filename
+                attachment.source_suffix = Path(attachment.source_filename).suffix
+
+            # Never format a missing suffix into the name as the text "None".
+            suffix = attachment.source_suffix or ''
+            saved_path = save_dir / f'{digest.hexdigest()}{suffix}'
+
+            # Copy the temporary download to its content-addressed location
+            if not saved_path.exists():
+                save_dir.mkdir(parents=True, exist_ok=True)
+                # ``copyfile`` reopens the file by name, so buffered bytes must
+                # reach the disk first or the copy would be truncated.
+                temp_file.flush()
+                shutil.copyfile(temp_file.name, saved_path)
+
+        return attachment
+
+    async def download_attachments(
+        self,
+        post: Post,
+        save_dir: Path
+    ) -> None:
+        """
+        Download the images, videos and voice replies attached to a post.
+
+        Attachment URLs are collected from ``post.body``: the
+        ``data-original`` attribute of images and the ``src`` attribute of
+        voice/movie player iframes. URLs that are already keys of
+        ``post.attachments`` are skipped; every finished download is stored
+        in ``post.attachments`` under its URL.
+
+        :param post:        post instance whose body is scanned
+        :param save_dir:    local directory the files are stored in
+        """
+        # ``post.body`` is None when the body element was not found, in which
+        # case there is nothing to scan.
+        if post.body is None:
+            return
+
+        player_prefixes = (
+            'https://m.dcinside.com/voice/player',
+            'https://m.dcinside.com/movie/player',
+        )
+
+        # Images
+        image_urls = [
+            tag['data-original'].strip()
+            for tag in post.body.select('img[data-original]')
+        ]
+
+        # Voice replies and videos; ``iframe[src]`` skips iframes without a
+        # ``src`` attribute instead of failing with KeyError.
+        player_urls = [
+            src
+            for src in (
+                tag['src'].strip()
+                for tag in post.body.select('iframe[src]')
+            )
+            if src.startswith(player_prefixes)
+        ]
+
+        # ``dict.fromkeys`` removes repeated URLs while keeping their order,
+        # so the same file is not downloaded twice.
+        pending = [
+            url
+            for url in dict.fromkeys([*image_urls, *player_urls])
+            if url not in post.attachments
+        ]
+
+        downloads = [
+            self.download_attachment(url, save_dir)
+            for url in pending
+        ]
+
+        # A plain ``for`` over ``as_completed`` works on every supported
+        # Python version; ``async for`` is only accepted from Python 3.13 on.
+        for download in asyncio.as_completed(downloads):
+            # TODO: error handling
+            attachment = await download
+            post.attachments[attachment.url] = attachment