migrate to aiohttp
scraper.py (149 changed lines)
@@ -1,145 +1,22 @@
-import re
-import hashlib
+import asyncio
-
-from typing import Tuple, List
-from datetime import datetime
-from zoneinfo import ZoneInfo
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-
-from threading import Semaphore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from requests import Session
-from requests.adapters import Retry
-from requests.cookies import create_cookie
-
-from bs4 import BeautifulSoup
-
-from cgi import parse_header
-from models import Post
+
+from utils.middlewares import SemaphoreMiddleware
+from utils.scraper import Scraper
-
-
-class Scraper(Session):
+
+
+async def main():
+    middlewares = (
+        SemaphoreMiddleware(5),
+    )
+
-    def __init__(self, concurrency = 5):
-        super().__init__()
-
-        self.semaphore = Semaphore(concurrency)
-
-        self.headers['User-Agent'] = '(Android)'
-        self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
-
-        self.cookies.set_cookie(create_cookie(
-            name='list_count',
-            value='200',
-            domain='.dcinside.com'
-        ))
-
-    def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
-        with self.semaphore:
-            res = self.get(url, stream=True)
-            res.raise_for_status()
-
-            hash = hashlib.sha1()
-
-            # fuck this shit
-            _, parts = parse_header(res.headers.get('Content-Disposition'))
-            fname = parts.get('filename', '')
-            fext = fname.split('.').pop()
-
-            with NamedTemporaryFile('wb', dir=save_dir) as file:
-                for chunk in res.iter_content(chunk_size=8192):
-                    if chunk:
-                        file.write(chunk)
-                        hash.update(chunk)
-
-                return url, Path(file.name).rename(save_dir / f'{hash.hexdigest()}.{fext}')
-
-
-    def replace_attachment(self, post: Post, save_dir: Path):
-        src_to_tags = {
-            img.attrs['data-original'].strip(): img
-            for img in post.body.select('img[data-original]')
-        }
-
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.download_attachment, src, save_dir)
-                for src in src_to_tags.keys()
-            ]
-
-            for future in as_completed(futures):
-                src, path = future.result()
-                src_to_tags[src]['src'] = path
-
-    def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
-        with self.semaphore:
-            res = self.get(
-                f'https://m.dcinside.com/board/{boardId}',
-                params={
-                    's_type': type,
-                    's_pos': offset,
-                    'serval': value
-                })
-            res.raise_for_status()
-
-        document = BeautifulSoup(res.text, 'html.parser')
-        return [
-            Post(
-                boardId=boardId,
-                postId=int(re.findall(r'/\d+', tag.attrs.get('href'))[0][1:])
-            )
-            for tag in document.select('.gall-detail-lnktb a[href]:first-child')
-        ]
-
-    def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
-        with self.semaphore:
-            res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
-            res.raise_for_status()
+    async with Scraper(middlewares=middlewares) as scraper:
+        posts = await scraper.list('roh', 'person')
-
-        document = BeautifulSoup(res.text, 'html.parser')
+
+        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
+            await future
-
-        titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
-
-        timeTag = titleWrapTags.pop()
-        authorTag = titleWrapTags.pop()
-        authorAnchorTag = authorTag.select_one('a')
-
-        titleParts = (
-            document
-            .select_one('.gallview-tit-box .tit')
-            .get_text(strip=True)
-            .split('\r\n')
-        )
-
-        post.title = titleParts.pop().strip()
-        post.category = titleParts.pop()[1:-1].strip()
-
-        if authorAnchorTag:
-            post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
-            post.authorName = authorAnchorTag.get_text(strip=True)
-        else:
-            authorParts = authorTag.get_text(strip=True).split('(')
-            post.authorId = authorParts[1][:-1].strip()
-            post.authorName = authorParts[0].strip()
-
-        post.created_at = (
-            datetime
-            .strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
-            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
-        )
-
-        post.body = document.select_one('.thum-txtin')
-
-        # lolz
-        if post.authorName != 'ori':
-            return
-
-        print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
-
-        if attachment_path:
-            self.replace_attachment(post, attachment_path)
-
-        if document_path:
-            (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
+
+if __name__ == '__main__':
+    asyncio.run(main())
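
The new entrypoint depends on utils/scraper.py and utils/middlewares.py, which this diff does not touch. Below is a minimal sketch of what those modules might look like on top of a plain aiohttp.ClientSession; apart from the imported names (Scraper, SemaphoreMiddleware) and the call sites above (middlewares=, list(), view()), every detail is an assumption, not the committed code.

import asyncio
import functools

import aiohttp

from models import Post  # same model the old code imported


class SemaphoreMiddleware:
    # Assumption: an asyncio.Semaphore in place of the old
    # threading.Semaphore(concurrency), capping in-flight requests.
    def __init__(self, limit: int):
        self._semaphore = asyncio.Semaphore(limit)

    async def __call__(self, handler):
        async with self._semaphore:
            return await handler()


class Scraper:
    # Assumption: an async context manager owning one aiohttp.ClientSession,
    # carrying over the headers and cookie the old __init__ set.
    def __init__(self, middlewares=()):
        self._middlewares = middlewares
        self._session = None

    async def __aenter__(self):
        self._session = aiohttp.ClientSession(
            headers={
                'User-Agent': '(Android)',
                'Referer': 'https://m.dcinside.com/board/6974gay',
            },
            cookies={'list_count': '200'},
        )
        return self

    async def __aexit__(self, *exc_info):
        await self._session.close()

    async def _fetch(self, url: str, **kwargs) -> str:
        # Every request is threaded through the middleware chain, so a
        # single SemaphoreMiddleware(5) bounds concurrency session-wide.
        async def handler():
            async with self._session.get(url, **kwargs) as res:
                res.raise_for_status()
                return await res.text()

        call = handler
        for middleware in reversed(self._middlewares):
            call = functools.partial(middleware, call)
        return await call()

    async def list(self, board_id: str, keyword: str) -> list[Post]:
        # Stub: would fetch and parse a board page into Posts, like the
        # removed search() did with BeautifulSoup.
        ...

    async def view(self, post: Post) -> Post:
        # Stub: would fetch and parse a single post, like the removed view().
        ...

Structurally this keeps the shape of the old class: SemaphoreMiddleware(5) takes over from threading.Semaphore(concurrency), and asyncio.as_completed in main() mirrors the old ThreadPoolExecutor/as_completed fan-out.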
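The removed download_attachment() is the part with the least direct aiohttp translation, since it relied on requests' stream=True and on cgi.parse_header. Here is a hedged sketch of how the streaming SHA-1 download could be ported, assuming it moved into the new Scraper; the method body is illustrative, not the committed code.

import hashlib
from pathlib import Path
from tempfile import NamedTemporaryFile


async def download_attachment(self, url: str, save_dir: Path):
    # Hypothetical method of the new Scraper; assumes self._session is
    # its aiohttp.ClientSession.
    async with self._session.get(url) as res:
        res.raise_for_status()

        # aiohttp parses Content-Disposition itself; None if the header
        # is absent.
        disposition = res.content_disposition
        fname = disposition.filename if disposition else ''
        fext = fname.split('.').pop()

        sha1 = hashlib.sha1()
        # delete=False so the rename below does not fight the temp-file
        # cleanup on close.
        with NamedTemporaryFile('wb', dir=save_dir, delete=False) as file:
            async for chunk in res.content.iter_chunked(8192):
                file.write(chunk)
                sha1.update(chunk)

        # Keep the old content-addressed naming scheme.
        return url, Path(file.name).rename(save_dir / f'{sha1.hexdigest()}.{fext}')

res.content.iter_chunked() stands in for requests' iter_content(), and res.content_disposition replaces cgi.parse_header, which matters because the cgi module was removed in Python 3.13.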