From 5679ce1d6a84cd1051299490acb0ea4abd5b971a Mon Sep 17 00:00:00 2001
From: Sangha Lee <aeon@vmm.pw>
Date: Mon, 4 Aug 2025 11:35:08 +0900
Subject: [PATCH] feat: fetching attachment

---
 models/__init__.py |   2 +-
 models/post.py     |  15 ++++-
 scraper.py         |  16 ++++-
 utils/scraper.py   | 148 +++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 169 insertions(+), 12 deletions(-)

diff --git a/models/__init__.py b/models/__init__.py
index fe98780..6d6cde1 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -1 +1 @@
-from .post import Post
+from .post import Attachment, Post
diff --git a/models/post.py b/models/post.py
index 6242d28..70f1a7f 100644
--- a/models/post.py
+++ b/models/post.py
@@ -1,5 +1,5 @@
-from typing import Optional
-from dataclasses import dataclass
+from typing import Optional, Dict
+from dataclasses import dataclass, field
 from datetime import datetime
 
 from bs4 import Tag
@@ -7,6 +7,16 @@ from bs4 import Tag
 from utils.typings import BoardPath
 
 
+@dataclass
+class Attachment:  # result record for one attachment of a post
+    url: str  # URL as found in the post; used as the key in Post.attachments
+    source_url: str  # URL actually downloaded; replaced for voice/video player pages
+    source_filename: Optional[str] = None  # file name from the Content-Disposition header, when sent
+    source_suffix: Optional[str] = None  # extension from that file name, else guessed from Content-Type
+    hash: Optional[str] = None  # presumably the SHA-1 hex digest of the content -- TODO confirm against the downloader
+    error: Optional[str] = None  # presumably a failure description -- TODO confirm, nothing visible sets it
+
+
 @dataclass
 class Post:
     id: int
@@ -17,4 +27,5 @@ class Post:
     category: Optional[str] = None
     title: Optional[str] = None
     body: Optional[Tag] = None
+    attachments: Dict[str, Attachment] = field(default_factory=lambda: {})
     created_at: Optional[datetime] = None
diff --git a/scraper.py b/scraper.py
index 690b7fd..03efb79 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,21 +1,31 @@
+import json
 import asyncio
 
 from pathlib import Path
 
 from utils.middlewares import SemaphoreMiddleware
 from utils.scraper import Scraper
+from models.post import Post
 
 
+archive_dir = Path('archives')  # directory handed to download_attachments as the save location
+
 async def main():
     middlewares = (
         SemaphoreMiddleware(5),
     )
 
     async with Scraper(middlewares=middlewares) as scraper:
-        posts = await scraper.list('roh', 'person')
+        post = Post(  # a single hard-coded post to fetch
+            id=2341247,
+            boardId='event_voicere',
+            boardPath='board'
+        )
+
+        await scraper.view(post)  # fills in the title, category and body of `post`
+        await scraper.download_attachments(post, archive_dir)  # saves files, records them in post.attachments
         
-        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
-            await future
+        print(post)
 
 
 if __name__ == '__main__':
diff --git a/utils/scraper.py b/utils/scraper.py
index 3d1e7a5..a267985 100644
--- a/utils/scraper.py
+++ b/utils/scraper.py
@@ -1,13 +1,21 @@
 import re
+import shutil
+import hashlib
+import mimetypes
+import asyncio
+import urllib.parse as urlparse
+
 from typing import Optional, List
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from datetime import datetime
 from zoneinfo import ZoneInfo
 
-from aiohttp import ClientSession
+from aiohttp import ClientSession, ClientError
 from bs4 import BeautifulSoup
 
 from .typings import BoardPath, SearchType
-from models import Post
+from models import Attachment, Post
 
 
 class Scraper(ClientSession):
@@ -27,7 +35,6 @@ class Scraper(ClientSession):
     async def __aenter__(self) -> 'Scraper':
         return self
 
-
     async def list(
         self, 
         boardId: str,
@@ -53,7 +60,6 @@ class Scraper(ClientSession):
         :param search_position:     검색 지점
         :param search_value:        검색어
         """
-
         url = f'https://m.dcinside.com/{boardPath}/{boardId}'
         params = {
             'page': page,
@@ -84,7 +90,6 @@ class Scraper(ClientSession):
 
         :param post:    조회할 게시글 인스턴스
         """
-        
         async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
             html = await response.text()
             document = BeautifulSoup(html, 'lxml')
@@ -122,7 +127,9 @@ class Scraper(ClientSession):
 
         # 제목과 말머리 파싱
         post.title = titleTexts.pop().strip()
-        post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX
+
+        if titleTexts:
+            post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX
 
         # 본문 파싱
         post.body = document.select_one('.thum-txtin')
@@ -132,3 +139,132 @@ class Scraper(ClientSession):
             tag.extract()
 
         print(f'{post.boardId}/{post.id}: {post.title}')
+
+    async def fetch_voice(self, id: str) -> str:
+        """
+        Resolve the real file URL of a voice reply from its mobile-web
+        iframe player page.
+
+        :param id: voice reply identifier (sent as the `vr` query argument)
+        :return: `value` attribute of the first `<input>` on the player page
+        """
+        params = {'vr': id, 'vr_open': 1}
+
+        # same request style as `view`: the request object is the context manager
+        async with self.get('https://m.dcinside.com/voice/player', params=params) as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+        return document.select_one('input')['value']
+
+    async def fetch_video(self, id: str) -> str:
+        """
+        Resolve the real file URL of a video from its mobile-web iframe
+        player page.
+
+        :param id: video identifier (sent as the `no` query argument)
+        :return: `src` attribute of the first `<source>` on the player page
+        """
+        params = {'no': id}
+
+        async with self.get('https://m.dcinside.com/movie/player', params=params) as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+        return document.select_one('source')['src']
+
+    async def download_attachment(
+        self,
+        url: str,
+        save_dir: Path,
+        chunk_size: int = 8192
+    ) -> Attachment:
+        """
+        Download one attachment into `save_dir`, named by the SHA-1 of its content.
+
+        :param url:         attachment URL (direct file URL or voice/video player URL)
+        :param save_dir:    local directory to save into (created when missing)
+        :param chunk_size:  maximum number of bytes handled per chunk
+        :return:            `Attachment` with the resolved URL, file name, suffix and hash
+        """
+        url_params = urlparse.parse_qs(urlparse.urlparse(url).query)
+        digest = hashlib.sha1()
+        attachment = Attachment(url=url, source_url=url)
+
+        # parse_qs maps every key to a list of values; the player id is the first one
+        if url.startswith('https://m.dcinside.com/voice/player'):
+            attachment.source_url = await self.fetch_voice(url_params['vr'][0])
+        elif url.startswith('https://m.dcinside.com/movie/player'):
+            attachment.source_url = await self.fetch_video(url_params['no'][0])
+
+        save_dir.mkdir(parents=True, exist_ok=True)
+
+        with NamedTemporaryFile('wb') as temp_file:
+            async with self.get(attachment.source_url) as response:
+                async for chunk in response.content.iter_chunked(chunk_size):
+                    temp_file.write(chunk)
+                    digest.update(chunk)
+
+            # push buffered bytes to disk so the copy below sees the whole file
+            temp_file.flush()
+            attachment.hash = digest.hexdigest()
+
+            # extension guessed from Content-Type (None when the type is unknown)
+            attachment.source_suffix = mimetypes.guess_extension(response.content_type)
+
+            # a file name sent in Content-Disposition takes precedence over the guess
+            disposition = response.content_disposition
+            if disposition and disposition.filename:
+                attachment.source_filename = disposition.filename
+                attachment.source_suffix = Path(disposition.filename).suffix
+
+            saved_path = save_dir / f'{attachment.hash}{attachment.source_suffix or ""}'
+            # an existing file with this hash-based name is kept, not overwritten
+            if not saved_path.exists():
+                shutil.copyfile(temp_file.name, saved_path)
+
+        return attachment
+
+    async def download_attachments(self, post: Post, save_dir: Path) -> None:
+        """
+        Download every image, video and voice attachment referenced by the
+        post body and record each result in `post.attachments`, keyed by URL.
+        URLs already present in `post.attachments` are not downloaded again.
+
+        :param post:        post whose `body` has been filled in by `view`
+        :param save_dir:    local directory the files are saved into
+        """
+        # nothing to scan until `view` has populated the body
+        if post.body is None:
+            return
+
+        player_prefixes = (
+            'https://m.dcinside.com/voice/player',
+            'https://m.dcinside.com/movie/player',
+        )
+
+        # images: the URL is read from the `data-original` attribute
+        urls = [
+            tag['data-original'].strip()
+            for tag in post.body.select('img[data-original]')
+        ]
+
+        # voice replies and videos: iframes pointing at a player page
+        urls += [
+            tag['src'].strip()
+            for tag in post.body.select('iframe[src]')
+            if tag['src'].strip().startswith(player_prefixes)
+        ]
+
+        # dict.fromkeys drops repeated URLs while keeping their order
+        pending = [
+            self.download_attachment(url, save_dir)
+            for url in dict.fromkeys(urls)
+            if url not in post.attachments
+        ]
+
+        # plain `for`: as_completed only supports `async for` on Python 3.13+
+        for future in asyncio.as_completed(pending):
+            # TODO: error handling
+            attachment = await future
+            post.attachments[attachment.url] = attachment