From 5679ce1d6a84cd1051299490acb0ea4abd5b971a Mon Sep 17 00:00:00 2001 From: Sangha Lee Date: Mon, 4 Aug 2025 11:35:08 +0900 Subject: [PATCH] feat: fetching attachment --- models/__init__.py | 2 +- models/post.py | 15 ++++- scraper.py | 16 ++++- utils/scraper.py | 148 +++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 169 insertions(+), 12 deletions(-) diff --git a/models/__init__.py b/models/__init__.py index fe98780..6d6cde1 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1 +1 @@ -from .post import Post +from .post import Attachment, Post diff --git a/models/post.py b/models/post.py index 6242d28..70f1a7f 100644 --- a/models/post.py +++ b/models/post.py @@ -1,5 +1,5 @@ -from typing import Optional -from dataclasses import dataclass +from typing import Optional, Dict +from dataclasses import dataclass, field from datetime import datetime from bs4 import Tag @@ -7,6 +7,16 @@ from bs4 import Tag from utils.typings import BoardPath +@dataclass +class Attachment: + url: str + source_url: str + source_filename: Optional[str] = None + source_suffix: Optional[str] = None + hash: Optional[str] = None + error: Optional[str] = None + + @dataclass class Post: id: int @@ -17,4 +27,5 @@ class Post: category: Optional[str] = None title: Optional[str] = None body: Optional[Tag] = None + attachments: Dict[str, Attachment] = field(default_factory=lambda: {}) created_at: Optional[datetime] = None diff --git a/scraper.py b/scraper.py index 690b7fd..03efb79 100644 --- a/scraper.py +++ b/scraper.py @@ -1,21 +1,31 @@ +import json import asyncio from pathlib import Path from utils.middlewares import SemaphoreMiddleware from utils.scraper import Scraper +from models.post import Post +archive_dir = Path('archives') + async def main(): middlewares = ( SemaphoreMiddleware(5), ) async with Scraper(middlewares=middlewares) as scraper: - posts = await scraper.list('roh', 'person') + post = Post( + id=2341247, + boardId='event_voicere', + boardPath='board' + ) + + await scraper.view(post) + await scraper.download_attachments(post, archive_dir) - for future in asyncio.as_completed([scraper.view(p) for p in posts]): - await future + print(post) if __name__ == '__main__': diff --git a/utils/scraper.py b/utils/scraper.py index 3d1e7a5..a267985 100644 --- a/utils/scraper.py +++ b/utils/scraper.py @@ -1,13 +1,21 @@ import re +import shutil +import hashlib +import mimetypes +import asyncio +import urllib.parse as urlparse + from typing import Optional, List +from pathlib import Path +from tempfile import NamedTemporaryFile from datetime import datetime from zoneinfo import ZoneInfo -from aiohttp import ClientSession +from aiohttp import ClientSession, ClientError from bs4 import BeautifulSoup from .typings import BoardPath, SearchType -from models import Post +from models import Attachment, Post class Scraper(ClientSession): @@ -27,7 +35,6 @@ class Scraper(ClientSession): async def __aenter__(self) -> 'Scraper': return self - async def list( self, boardId: str, @@ -53,7 +60,6 @@ class Scraper(ClientSession): :param search_position: 검색 지점 :param search_value: 검색어 """ - url = f'https://m.dcinside.com/{boardPath}/{boardId}' params = { 'page': page, @@ -84,7 +90,6 @@ class Scraper(ClientSession): :param post: 조회할 게시글 인스턴스 """ - async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response: html = await response.text() document = BeautifulSoup(html, 'lxml') @@ -122,7 +127,9 @@ class Scraper(ClientSession): # 제목과 말머리 파싱 post.title = titleTexts.pop().strip() - post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX + + if titleTexts: + post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX # 본문 파싱 post.body = document.select_one('.thum-txtin') @@ -132,3 +139,132 @@ class Scraper(ClientSession): tag.extract() print(f'{post.boardId}/{post.id}: {post.title}') + + async def fetch_voice(self, id: str): + """ + 모바일 웹의 보이스 리플 iframe 페이지로부터 실제 파일 경로를 가져옵니다 + + :param id: 보이스 리플 아이디 (`vr` 인자) + """ + params = { + 'vr': id, + 'vr_open': 1 + } + + async with await self.get('https://m.dcinside.com/voice/player', params=params) as response: + html = await response.text() + document = BeautifulSoup(html, 'lxml') + + return document.select_one('input')['value'] + + async def fetch_video(self, id: str): + """ + 모바일 웹의 동영상 iframe 페이지로부터 실제 파일 경로를 가져옵니다 + + :param id: 동영상 아이디 (`no` 인자) + """ + params = { + 'no': id + } + + async with await self.get('https://m.dcinside.com/movie/player', params=params) as response: + html = await response.text() + document = BeautifulSoup(html, 'lxml') + + return document.select_one('source')['src'] + + async def download_attachment( + self, + url: str, + save_dir: Path, + chunk_size = 8192 + ) -> Attachment: + """ + 첨부 파일을 받아옵니다 + + :param url: 받아올 첨부 파일의 주소 + :param save_dir: 받아질 로컬 디렉터리 경로 + :param chunk_size: 청크 크기 + """ + url_parsed = urlparse.urlparse(url) + url_params = urlparse.parse_qs(url_parsed.query) + + hash = hashlib.sha1() + attachment = Attachment( + url=url, + source_url=url + ) + + if url.startswith('https://m.dcinside.com/voice/player'): + # 보이스 리플 + attachment.source_url = await self.fetch_voice(url_params.get('vr')) + + elif url.startswith('https://m.dcinside.com/movie/player'): + # 동영상 + attachment.source_url = await self.fetch_video(url_params.get('no')) + + with NamedTemporaryFile('wb') as temp_file: + async with await self.get(attachment.source_url) as response: + async for chunk in response.content.iter_chunked(chunk_size): + temp_file.write(chunk) + hash.update(chunk) + + # Content-Type 헤더로부터 확장자 알아내기 + attachment.source_suffix = mimetypes.guess_extension(response.content_type) + + # Content-Disposition 헤더로부터 실제 파일 이름과 확장자 알아내기 + if response.content_disposition and response.content_disposition.filename: + attachment.source_filename = response.content_disposition.filename + attachment.source_suffix = Path(attachment.source_filename).suffix + + saved_path = save_dir / f'{hash.hexdigest()}{attachment.source_suffix}' + + # 임시로 받은 파일 옮기기 + if not saved_path.exists(): + shutil.copyfile(temp_file.name, saved_path) + + return attachment + + async def download_attachments( + self, + post: Post, + save_dir: Path + ): + """ + 게시글에 첨부된 이미지, 동영상, 음성 등 첨부 파일을 받아옵니다 + + :param post: 게시글 인스턴스 + :param save_dir: 받아질 로컬 디렉터리 경로 + """ + urls = [ + # 이미지 + *[ + tag['data-original'].strip() + for tag in post.body.select('img[data-original]') + ], + + # 보이스 리플 및 동영상 + *filter( + lambda url: ( + url.startswith('https://m.dcinside.com/voice/player') or + url.startswith('https://m.dcinside.com/movie/player') + ), + [ + tag['src'].strip() + for tag in post.body.select('iframe') + ] + ) + ] + + futures = [ + self.download_attachment(url, save_dir) + for url in filter( + lambda x: x not in post.attachments, + urls + ) + ] + + async for future in asyncio.as_completed(futures): + # TODO: 오류 핸들링 + attachment = await future + post.attachments[attachment.url] = attachment