# mandu-crawler/utils/scraper.py

import re
import shutil
import hashlib
import mimetypes
import asyncio
import urllib.parse as urlparse

from typing import Optional, List
from pathlib import Path
from tempfile import NamedTemporaryFile
from datetime import datetime
from zoneinfo import ZoneInfo

from aiohttp import ClientSession, ClientError
from bs4 import BeautifulSoup

from .typings import BoardPath, SearchType
from models import Attachment, Post


# Number of posts requested per list page; sent to the site as the
# `list_count` cookie by `Scraper.__init__`.
LIST_MAX_POSTS = 200


class Scraper(ClientSession):
    """
    aiohttp client session preconfigured for scraping the DCInside mobile site.

    Provides helpers to list posts of a board, read a single post, and
    download the files (images, videos, voice replies) attached to a post.
    """

    # URL prefixes of the iframe players that wrap the real media file
    _VOICE_PLAYER_URL = 'https://m.dcinside.com/voice/player'
    _VIDEO_PLAYER_URL = 'https://m.dcinside.com/movie/player'

    def __init__(self, *args, **kwargs):
        """
        Create the session and install the default headers and cookies

        All arguments are forwarded unchanged to `ClientSession`.
        """
        super().__init__(*args, **kwargs)

        # Default headers required when requesting mobile pages and attachments
        self.headers['Accept'] = '*/*'
        self.headers['User-Agent'] = '(Android)'
        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'

        # Number of items loaded at once by a post-list request
        self.cookie_jar.update_cookies({
            'list_count': str(LIST_MAX_POSTS)
        })

    async def __aenter__(self) -> 'Scraper':
        """Enter the async context; overridden so the result is typed as `Scraper`"""
        return self

    @staticmethod
    def _extract_post_id(href: str) -> Optional[int]:
        """
        Return the first path segment of `href` made only of digits, or None

        Matching whole segments (instead of any `/` followed by digits) keeps a
        board id that merely starts with digits from being read as the post id.
        """
        match = re.search(r'/(\d+)(?=[/?#]|$)', href)
        return int(match.group(1)) if match else None

    @staticmethod
    def _first_query_value(query: dict, key: str) -> str:
        """
        Return the first value of `key` from a `parse_qs` result

        :raises ValueError: when the parameter is missing or blank
        """
        values = query.get(key)
        if not values:
            raise ValueError(f'missing query parameter: {key!r}')
        return values[0]

    async def list(
        self,
        board_id: str,
        board_path: BoardPath = 'board',
        page: int = 1,
        category_id: int = 0,
        only_recommended: bool = False,
        only_notice: bool = False,
        search_type: Optional[SearchType] = None,
        search_position: Optional[int] = None,
        search_value: Optional[str] = None
    ) -> List[Post]:
        """
        Fetch the list of posts of a board that match the given conditions

        Only `id`, `board_id` and `board_path` are filled in on the returned
        posts; call `view` to load the rest. List entries without a link or
        without a numeric post id in the link are skipped.

        :param board_id:            board identifier
        :param board_path:          board path (kind of board)
        :param page:                page number
        :param category_id:         category (post prefix) identifier
        :param only_recommended:    list only recommended posts?
        :param only_notice:         list only notice posts?
        :param search_type:         kind of search
        :param search_position:     search position
        :param search_value:        search keyword
        """
        url = f'https://m.dcinside.com/{board_path}/{board_id}'
        params = {
            'page': page,
            'headid': category_id,
            'recommend': '1' if only_recommended else '0',
            'notice': '1' if only_notice else '0',
            's_type': search_type or '',
            's_pos': search_position or '',
            'serval': search_value or ''
        }

        async with self.get(url, params=params) as response:
            html = await response.text()
            document = BeautifulSoup(html, 'lxml')

        posts = []

        for tag in document.select('.gall-detail-lnktb'):
            anchor = tag.select_one('a[href]')
            post_id = self._extract_post_id(anchor['href']) if anchor else None

            if post_id is None:
                continue

            posts.append(
                Post(
                    id=post_id,
                    board_id=board_id,
                    board_path=board_path
                )
            )

        return posts

    async def view(self, post: Post) -> Post:
        """
        Load the content of a post

        Fills `created_at`, `author_id`, `author_name`, `title`, `category`
        (only when the page shows one) and `body` on the given instance and
        returns that same instance. `body` is the parsed element itself, with
        `script` and `style` elements removed.

        :param post:    post instance to load
        """
        url = f'https://m.dcinside.com/{post.board_path}/{post.board_id}/{post.id}'

        async with self.get(url) as response:
            html = await response.text()
            document = BeautifulSoup(html, 'lxml')

        # The header info list is split into `li` items: author first, then
        # timestamp. Fewer than two items means the page layout is not what
        # this parser expects, and the unpacking below raises ValueError.
        author_tag, timestamp_tag, *_ = document.select('.gallview-tit-box .ginfo2 > li')
        author_anchor_tag = author_tag.select_one('a')

        # Creation time; the page shows a naive timestamp, tagged here as Seoul time
        post.created_at = (
            datetime
                .strptime(timestamp_tag.get_text(strip=True), '%Y.%m.%d %H:%M')
                .replace(tzinfo=ZoneInfo('Asia/Seoul'))
        )

        # Author information
        if author_anchor_tag:
            # An anchor inside the author element links to the author's page;
            # the last path segment of that link is used as the author id
            post.author_id = re.findall(r'\/\w+$', author_anchor_tag['href'])[0][1:]
            post.author_name = author_anchor_tag.get_text(strip=True)
        else:
            # Without an anchor the text looks like "name(123.123)". Split on
            # the LAST "(" so a name that itself contains "(" stays intact.
            author_text = author_tag.get_text(strip=True)
            name, paren, address = author_text.rpartition('(')

            if paren:
                post.author_id = address.strip().removesuffix(')').strip()  # 123.123) -> 123.123
                post.author_name = name.strip()
            else:
                # NOTE(review): no "(...)" part at all; confirm that
                # Post.author_id accepts None
                post.author_id = None
                post.author_name = author_text

        # On the mobile web the category and the title are separated by `\n`
        title_texts = (
            document
                .select_one('.gallview-tit-box .tit')
                .get_text(strip=True)
                .split('\n')
        )

        # The title is the last line; the category, if any, is the line before it
        post.title = title_texts.pop().strip()

        if title_texts:
            category = title_texts.pop().strip()

            # Strip whitespace BEFORE removing the brackets so that exactly the
            # surrounding "[" and "]" are dropped: [XX] -> XX
            if category.startswith('[') and category.endswith(']'):
                category = category[1:-1]

            post.category = category.strip()

        # Body
        post.body = document.select_one('.thum-txtin')

        # Remove body elements that are not content
        for tag in post.body.select('script, style'):
            tag.extract()

        return post

    async def fetch_voice(self, id: str) -> str:
        """
        Resolve the real file URL from the mobile voice-reply iframe page

        :param id: voice reply identifier (the `vr` query argument)
        """
        params = {
            'vr': id,
            'vr_open': 1
        }

        async with self.get(self._VOICE_PLAYER_URL, params=params) as response:
            html = await response.text()
            document = BeautifulSoup(html, 'lxml')

        # The file URL is read from the `value` of the first `input` element
        return document.select_one('input')['value']

    async def fetch_video(self, id: str) -> str:
        """
        Resolve the real file URL from the mobile video iframe page

        :param id: video identifier (the `no` query argument)
        """
        params = {
            'no': id
        }

        async with self.get(self._VIDEO_PLAYER_URL, params=params) as response:
            html = await response.text()
            document = BeautifulSoup(html, 'lxml')

        # The file URL is read from the `src` of the first `source` element
        return document.select_one('source')['src']

    async def download_attachment(self, url: str, save_dir: Path) -> Attachment:
        """
        Download a single attachment

        The file is streamed into a temporary file while its SHA-1 is computed,
        then copied to `save_dir` as `<sha1><suffix>` unless a file with that
        name already exists.

        :param url:         address of the attachment to download
        :param save_dir:    local directory the file is saved into
        :raises ValueError: when a player URL lacks its identifier parameter
        """
        url_parsed = urlparse.urlparse(url)
        url_params = urlparse.parse_qs(url_parsed.query)

        digest = hashlib.sha1()
        attachment = Attachment(
            url=url,
            source_url=url
        )

        # Player pages only embed the media; resolve the real file URL first.
        # `parse_qs` maps every key to a LIST of values, so take the first one.
        if url.startswith(self._VOICE_PLAYER_URL):
            # Voice reply
            voice_id = self._first_query_value(url_params, 'vr')
            attachment.source_url = await self.fetch_voice(voice_id)

        elif url.startswith(self._VIDEO_PLAYER_URL):
            # Video
            video_id = self._first_query_value(url_params, 'no')
            attachment.source_url = await self.fetch_video(video_id)

        with NamedTemporaryFile('wb') as temp_file:
            async with self.get(attachment.source_url) as response:
                async for chunk, _ in response.content.iter_chunks():
                    temp_file.write(chunk)
                    digest.update(chunk)

                # Make sure everything is on disk before the file is copied
                temp_file.flush()

                # Guess the extension from the Content-Type header (may be None)
                attachment.source_suffix = mimetypes.guess_extension(response.content_type)

                # Prefer the real file name and extension from Content-Disposition
                disposition = response.content_disposition

                if disposition and disposition.filename:
                    attachment.source_filename = disposition.filename
                    attachment.source_suffix = Path(attachment.source_filename).suffix

            attachment.hash = digest.hexdigest()

            # An unknown extension must not end up as the text "None" in the name
            saved_path = save_dir / f"{attachment.hash}{attachment.source_suffix or ''}"

            # Copy the temporary file into place (identical content is stored once)
            if not saved_path.exists():
                shutil.copyfile(temp_file.name, saved_path)

        return attachment

    async def download_attachments(
        self,
        post: Post,
        save_dir: Path
    ) -> None:
        """
        Download the images, videos and voice replies attached to a post

        URLs already present in `post.attachments` are skipped; every finished
        download is stored in `post.attachments` under its original URL.
        `post.body` must already be loaded (see `view`).

        :param post:        post instance
        :param save_dir:    local directory the files are saved into
        """
        # Images; animated images are converted by the site and exposed as
        # `data-gif` / `data-mp4`, in which case the mp4 is preferred
        # TODO: clean this up
        image_urls = [
            tag.attrs.get('data-mp4', tag['data-original']).strip()
            for tag in post.body.select('img[data-original]')
        ]

        # Voice replies and videos are embedded through player iframes
        player_prefixes = (self._VOICE_PLAYER_URL, self._VIDEO_PLAYER_URL)
        player_urls = [
            src
            for src in (
                tag['src'].strip()
                for tag in post.body.select('iframe[src]')
            )
            if src.startswith(player_prefixes)
        ]

        # Drop duplicates (keeping order) and anything downloaded earlier
        pending_urls = [
            url
            for url in dict.fromkeys([*image_urls, *player_urls])
            if url not in post.attachments
        ]

        downloads = [
            self.download_attachment(url, save_dir)
            for url in pending_urls
        ]

        # `as_completed` is a plain iterator of awaitables before Python 3.13,
        # so it is consumed with `for` + `await` rather than `async for`
        for finished in asyncio.as_completed(downloads):
            # TODO: error handling
            attachment = await finished
            post.attachments[attachment.url] = attachment