fix: missing chunk, fr this time

fix: missing chunk from attachment files
fix: remove hardcoded integer
2025-08-04 14:41:21 +09:00 · 2025-08-04 13:50:15 +09:00 · 2025-08-04 13:31:48 +09:00
1 changed file with 15 additions and 14 deletions
--- a/utils/scraper.py
+++ b/utils/scraper.py
@@ -18,18 +18,22 @@ from .typings import BoardPath, SearchType
 from models import Attachment, Post


+LIST_MAX_POSTS = 200
+
+
 class Scraper(ClientSession):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # 모바일 페이지 및 첨부 파일 요청 시 필요한 기본 헤더 값
+        self.headers['Accept'] = '*/*'
        self.headers['User-Agent'] = '(Android)'
        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'

        # 게시글 목록 조회로 한 번에 불러올 항목 수
        self.cookie_jar.update_cookies({
-            'list_count': '200'
+            'list_count': LIST_MAX_POSTS
        })

    async def __aenter__(self) -> 'Scraper':
@@ -77,14 +81,14 @@ class Scraper(ClientSession):

        return [
            Post(
-                id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
+                id=int(re.findall(r'/\d+', tag.select_one('a[href]')['href'])[0][1:]),
                board_id=board_id,
                board_path=board_path
            )
            for tag in document.select('.gall-detail-lnktb')
        ]
    
-    async def view(self, post: Post):
+    async def view(self, post: Post) -> Post:
        """
        게시글 내용을 조회합니다

@@ -137,8 +141,8 @@ class Scraper(ClientSession):
        # 불필요한 본문 요소 제거
        for tag in post.body.select('script, style'):
            tag.extract()
-
-        print(f'{post.board_id}/{post.id}: {post.title}')
+            
+        return post

    async def fetch_voice(self, id: str):
        """
@@ -173,18 +177,12 @@ class Scraper(ClientSession):

        return document.select_one('source')['src']

-    async def download_attachment(
-        self, 
-        url: str, 
-        save_dir: Path,
-        chunk_size = 8192
-    ) -> Attachment:
+    async def download_attachment(self, url: str, save_dir: Path) -> Attachment:
        """
        첨부 파일을 받아옵니다

        :param url:         받아올 첨부 파일의 주소
        :param save_dir:    받아질 로컬 디렉터리 경로
-        :param chunk_size:  청크 크기
        """
        url_parsed = urlparse.urlparse(url)
        url_params = urlparse.parse_qs(url_parsed.query)
@@ -205,9 +203,10 @@ class Scraper(ClientSession):

        with NamedTemporaryFile('wb') as temp_file:
            async with await self.get(attachment.source_url) as response:
-                async for chunk in response.content.iter_chunked(chunk_size):
+                async for chunk, _ in response.content.iter_chunks():
                    temp_file.write(chunk)
                    hash.update(chunk)
+                temp_file.flush()

            # Content-Type 헤더로부터 확장자 알아내기
            attachment.source_suffix = mimetypes.guess_extension(response.content_type)
@@ -241,7 +240,9 @@ class Scraper(ClientSession):
        urls = [
            # 이미지
            *[
-                tag['data-original'].strip()
+                # 움짤은 자동 변환 후 `data-gif`와 `data-mp4`로 반환됨
+                # TODO: this attribute fallback is a stopgap; clean it up
+                tag.attrs.get('data-mp4', tag['data-original']).strip()
                for tag in post.body.select('img[data-original]')
            ],
Author	SHA1	Message	Date
Sangha Lee	4efd92bcb0	fix: missing chunk, fr this time	2025-08-04 14:41:21 +09:00
Sangha Lee	753da82a48	fix: missing chunk from attachment files	2025-08-04 13:50:15 +09:00
Sangha Lee	f3344c57c3	fix: remove hardcoded integer	2025-08-04 13:31:48 +09:00