1
0

feat: fetching attachment

This commit is contained in:
2025-08-04 11:35:08 +09:00
parent 02fd0ae13e
commit 5679ce1d6a
4 changed files with 169 additions and 12 deletions

View File

@@ -1,13 +1,21 @@
import re
import shutil
import hashlib
import mimetypes
import asyncio
import urllib.parse as urlparse
from typing import Optional, List
from pathlib import Path
from tempfile import NamedTemporaryFile
from datetime import datetime
from zoneinfo import ZoneInfo
from aiohttp import ClientSession
from aiohttp import ClientSession, ClientError
from bs4 import BeautifulSoup
from .typings import BoardPath, SearchType
from models import Post
from models import Attachment, Post
class Scraper(ClientSession):
@@ -27,7 +35,6 @@ class Scraper(ClientSession):
async def __aenter__(self) -> 'Scraper':
    """Enter the async context and return this instance, annotated as ``Scraper``."""
    return self
async def list(
self,
boardId: str,
@@ -53,7 +60,6 @@ class Scraper(ClientSession):
:param search_position: 검색 지점
:param search_value: 검색어
"""
url = f'https://m.dcinside.com/{boardPath}/{boardId}'
params = {
'page': page,
@@ -84,7 +90,6 @@ class Scraper(ClientSession):
:param post: 조회할 게시글 인스턴스
"""
async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
html = await response.text()
document = BeautifulSoup(html, 'lxml')
@@ -122,7 +127,9 @@ class Scraper(ClientSession):
# 제목과 말머리 파싱
post.title = titleTexts.pop().strip()
post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX
if titleTexts:
post.category = titleTexts.pop()[1:~1].strip() # [XX] -> XX
# 본문 파싱
post.body = document.select_one('.thum-txtin')
@@ -132,3 +139,132 @@ class Scraper(ClientSession):
tag.extract()
print(f'{post.boardId}/{post.id}: {post.title}')
async def fetch_voice(self, id: str):
    """
    Resolve the real media file URL of a voice reply from the mobile web
    player iframe page.

    :param id: voice reply identifier (the ``vr`` query argument)
    :return: the ``value`` attribute of the first ``<input>`` element on the player page
    :raises TypeError: if the page has no ``<input>`` element (``select_one`` returns ``None``)
    :raises KeyError: if that element has no ``value`` attribute
    """
    params = {
        'vr': id,
        'vr_open': 1
    }

    # Use the request context manager directly, like the other requests in this class
    async with self.get('https://m.dcinside.com/voice/player', params=params) as response:
        html = await response.text()

    document = BeautifulSoup(html, 'lxml')
    return document.select_one('input')['value']
async def fetch_video(self, id: str):
    """
    Resolve the real media file URL of a video from the mobile web player
    iframe page.

    :param id: video identifier (the ``no`` query argument)
    :return: the ``src`` attribute of the first ``<source>`` element on the player page
    :raises TypeError: if the page has no ``<source>`` element (``select_one`` returns ``None``)
    :raises KeyError: if that element has no ``src`` attribute
    """
    params = {
        'no': id
    }

    # Use the request context manager directly, like the other requests in this class
    async with self.get('https://m.dcinside.com/movie/player', params=params) as response:
        html = await response.text()

    document = BeautifulSoup(html, 'lxml')
    return document.select_one('source')['src']
async def download_attachment(
    self,
    url: str,
    save_dir: Path,
    chunk_size = 8192
) -> Attachment:
    """
    Download a single attachment into ``save_dir``.

    The payload is streamed into a temporary file while its SHA-1 digest is
    computed; the file is then stored as ``<sha1 hex digest><suffix>``, so
    identical content is written only once.

    :param url: address of the attachment as it appears in the post
    :param save_dir: local directory the file is saved into
    :param chunk_size: number of bytes read from the response per chunk
    :return: the populated attachment record
    :raises ValueError: if a voice/video player URL lacks its id query parameter
    """
    query = urlparse.parse_qs(urlparse.urlparse(url).query)
    digest = hashlib.sha1()

    attachment = Attachment(
        url=url,
        source_url=url
    )

    # Player iframes do not point at the media itself: the real file URL has
    # to be resolved from the player page first.
    players = {
        'https://m.dcinside.com/voice/player': ('vr', self.fetch_voice),  # voice reply
        'https://m.dcinside.com/movie/player': ('no', self.fetch_video),  # video
    }

    for prefix, (key, resolve) in players.items():
        if not url.startswith(prefix):
            continue

        # parse_qs maps every key to a *list* of values; the resolvers expect one id string
        values = query.get(key)
        if not values:
            raise ValueError(f'player URL has no `{key}` query parameter: {url}')

        attachment.source_url = await resolve(values[0])
        break

    # 'w+b' so the very same handle can be rewound and read back for the copy
    with NamedTemporaryFile('w+b') as temp_file:
        async with self.get(attachment.source_url) as response:
            async for chunk in response.content.iter_chunked(chunk_size):
                temp_file.write(chunk)
                digest.update(chunk)

            # Derive the extension from the Content-Type header; guess_extension()
            # returns None for unknown types, which must not end up in the file name
            attachment.source_suffix = mimetypes.guess_extension(response.content_type) or ''

            # Prefer the real file name and extension from the Content-Disposition header
            disposition = response.content_disposition
            if disposition and disposition.filename:
                attachment.source_filename = disposition.filename
                attachment.source_suffix = Path(attachment.source_filename).suffix

        saved_path = save_dir / f'{digest.hexdigest()}{attachment.source_suffix}'

        # Move the temporary download into place unless the same content already exists
        if not saved_path.exists():
            # Rewinding flushes buffered writes; copying from the open handle avoids
            # re-opening the temporary file by name (which is not possible on Windows)
            temp_file.seek(0)
            with saved_path.open('wb') as target:
                shutil.copyfileobj(temp_file, target)

    return attachment
async def download_attachments(
    self,
    post: Post,
    save_dir: Path
):
    """
    Download the images, videos and voice replies attached to a post.

    URLs already present in ``post.attachments`` are skipped; every finished
    download is stored in ``post.attachments`` under its original URL.

    :param post: post instance whose parsed body is scanned for attachments
    :param save_dir: local directory the files are saved into
    """
    player_prefixes = (
        'https://m.dcinside.com/voice/player',
        'https://m.dcinside.com/movie/player'
    )

    # Images
    image_urls = [
        tag['data-original'].strip()
        for tag in post.body.select('img[data-original]')
    ]

    # Voice replies and videos; `iframe[src]` skips iframes without a src attribute
    iframe_urls = (
        tag['src'].strip()
        for tag in post.body.select('iframe[src]')
    )
    player_urls = [src for src in iframe_urls if src.startswith(player_prefixes)]

    # dict.fromkeys() drops repeated URLs while keeping document order, so a
    # file referenced twice in the body is only downloaded once
    pending = [
        url
        for url in dict.fromkeys(image_urls + player_urls)
        if url not in post.attachments
    ]

    downloads = [
        self.download_attachment(url, save_dir)
        for url in pending
    ]

    # A plain `for` works on every Python version; as_completed() only
    # supports `async for` from Python 3.13 on
    for finished in asyncio.as_completed(downloads):
        # TODO: error handling
        attachment = await finished
        post.attachments[attachment.url] = attachment