diff --git a/adapter.py b/adapter.py
deleted file mode 100644
index 8d00758..0000000
--- a/adapter.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from requests.adapters import HTTPAdapter
-
-class TimeoutHTTPAdapter(HTTPAdapter):
-    def __init__(self, *args, **kwargs):
-        if "timeout" in kwargs:
-            self.timeout = kwargs["timeout"]
-            del kwargs["timeout"]
-        super().__init__(*args, **kwargs)
-
-    def send(self, request, **kwargs):
-        timeout = kwargs.get("timeout")
-        if timeout is None and hasattr(self, 'timeout'):
-            kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
diff --git a/main.py b/main.py
deleted file mode 100644
index 9bda43c..0000000
--- a/main.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from requests.adapters import Retry
-
-from adapter import TimeoutHTTPAdapter
-from scraper import Scraper
-
-
-scraper = Scraper(concurrency=5)
-
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
-
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-
-
-except KeyboardInterrupt:
-    print(':-)')
diff --git a/models/__init__.py b/models/__init__.py
index 9835eb9..fe98780 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -1 +1 @@
-from .post import *
+from .post import Post
diff --git a/models/post.py b/models/post.py
index c2c7356..6242d28 100644
--- a/models/post.py
+++ b/models/post.py
@@ -4,11 +4,14 @@
 from datetime import datetime
 
 from bs4 import Tag
 
+from utils.typings import BoardPath
+
 @dataclass
 class Post:
+    id: int
     boardId: str
-    postId: int
+    boardPath: BoardPath
     authorId: Optional[str] = None
     authorName: Optional[str] = None
     category: Optional[str] = None
diff --git a/requirements.txt b/requirements.txt
index 41b8b2d..cdb28f4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,16 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+attrs==25.3.0
 beautifulsoup4==4.13.4
 certifi==2025.7.14
 charset-normalizer==3.4.2
+frozenlist==1.7.0
 idna==3.10
-requests==2.32.4
+lxml==6.0.0
+multidict==6.6.3
+propcache==0.3.2
 soupsieve==2.7
 typing_extensions==4.14.1
 urllib3==2.5.0
+yarl==1.20.1
diff --git a/scraper.py b/scraper.py
index ac9994a..690b7fd 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,145 +1,22 @@
-import re
-import hashlib
+import asyncio
 
-from typing import Tuple, List
-from datetime import datetime
-from zoneinfo import ZoneInfo
 from pathlib import Path
-from tempfile import NamedTemporaryFile
-from threading import Semaphore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from requests import Session
-from requests.adapters import Retry
-from requests.cookies import create_cookie
-
-from bs4 import BeautifulSoup
-
-from cgi import parse_header
-from models import Post
 
+from utils.middlewares import SemaphoreMiddleware
+from utils.scraper import Scraper
 
 
-class Scraper(Session):
+async def main():
+    middlewares = (
+        SemaphoreMiddleware(5),
+    )
 
-    def __init__(self, concurrency = 5):
-        super().__init__()
-
-        self.semaphore = Semaphore(concurrency)
-
-        self.headers['User-Agent'] = '(Android)'
-        self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
-
-        self.cookies.set_cookie(create_cookie(
-            name='list_count',
-            value='200',
-            domain='.dcinside.com'
-        ))
-
-    def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
-        with self.semaphore:
-            res = self.get(url, stream=True)
-            res.raise_for_status()
-
-            hash = hashlib.sha1()
-
-            # fuck this shit
-            _, parts = parse_header(res.headers.get('Content-Disposition'))
-            fname = parts.get('filename', '')
-            fext = fname.split('.').pop()
-
-            with NamedTemporaryFile('wb', dir=save_dir) as file:
-                for chunk in res.iter_content(chunk_size=8192):
-                    if chunk:
-                        file.write(chunk)
-                        hash.update(chunk)
-
-                return url, Path(file.name).rename(save_dir / f'{hash.hexdigest()}.{fext}')
-
-
-    def replace_attachment(self, post: Post, save_dir: Path):
-        src_to_tags = {
-            img.attrs['data-original'].strip(): img
-            for img in post.body.select('img[data-original]')
-        }
-
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.download_attachment, src, save_dir)
-                for src in src_to_tags.keys()
-            ]
-
-            for future in as_completed(futures):
-                src, path = future.result()
-                src_to_tags[src]['src'] = path
-
-    def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
-        with self.semaphore:
-            res = self.get(
-                f'https://m.dcinside.com/board/{boardId}',
-                params={
-                    's_type': type,
-                    's_pos': offset,
-                    'serval': value
-                })
-            res.raise_for_status()
-
-            document = BeautifulSoup(res.text, 'html.parser')
-            return [
-                Post(
-                    boardId=boardId,
-                    postId=int(re.findall(r'/\d+', tag.attrs.get('href'))[0][1:])
-                )
-                for tag in document.select('.gall-detail-lnktb a[href]:first-child')
-            ]
-
-    def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
-        with self.semaphore:
-            res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
-            res.raise_for_status()
+    async with Scraper(middlewares=middlewares) as scraper:
+        posts = await scraper.list('roh', 'person')
 
-            document = BeautifulSoup(res.text, 'html.parser')
+        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
+            await future
 
-            titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
-            timeTag = titleWrapTags.pop()
-            authorTag = titleWrapTags.pop()
-            authorAnchorTag = authorTag.select_one('a')
-
-            titleParts = (
-                document
-                .select_one('.gallview-tit-box .tit')
-                .get_text(strip=True)
-                .split('\r\n')
-            )
-
-            post.title = titleParts.pop().strip()
-            post.category = titleParts.pop()[1:-1].strip()
-
-            if authorAnchorTag:
-                post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
-                post.authorName = authorAnchorTag.get_text(strip=True)
-            else:
-                authorParts = authorTag.get_text(strip=True).split('(')
-                post.authorId = authorParts[1][:-1].strip()
-                post.authorName = authorParts[0].strip()
-
-            post.created_at = (
-                datetime
-                .strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
-                .replace(tzinfo=ZoneInfo('Asia/Seoul'))
-            )
-
-            post.body = document.select_one('.thum-txtin')
-
-            # lolz
-            if post.authorName != 'ori':
-                return
-
-            print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
-
-            if attachment_path:
-                self.replace_attachment(post, attachment_path)
-
-            if document_path:
-                (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/middlewares.py b/utils/middlewares.py
new file mode 100644
index 0000000..785d9c7
--- /dev/null
+++ b/utils/middlewares.py
@@ -0,0 +1,8 @@
+import asyncio
+
+from aiohttp import ClientRequest, ClientResponse, ClientHandlerType
+
+class SemaphoreMiddleware(asyncio.Semaphore):
+    async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
+        async with self:
+            return await handler(req)
diff --git a/utils/scraper.py b/utils/scraper.py
new file mode 100644
index 0000000..43e6ac8
--- /dev/null
+++ b/utils/scraper.py
@@ -0,0 +1,133 @@
+import re
+from typing import Optional, List
+from datetime import datetime
+from zoneinfo import ZoneInfo
+
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
+
+from .typings import BoardPath, SearchType
+from models import Post
+
+
+class Scraper(ClientSession):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Default headers required for mobile page and attachment requests
+        self.headers['User-Agent'] = '(Android)'
+        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'
+
+        # Number of items to load per post list request
+        self.cookie_jar.update_cookies({
+            'list_count': '200'
+        })
+
+    async def __aenter__(self) -> 'Scraper':
+        return self
+
+
+    async def list(
+        self,
+        boardId: str,
+        boardPath: BoardPath = 'board',
+        page: int = 1,
+        categoryId: int = 0,
+        only_recommended: bool = False,
+        only_notice: bool = False,
+        search_type: Optional[SearchType] = None,
+        search_position: Optional[int] = None,
+        search_value: Optional[str] = None
+    ) -> List[Post]:
+        """
+        Fetches the posts matching the given criteria from a board
+
+        :param boardId: Board id
+        :param boardPath: Board path (board kind)
+        :param page: Page number
+        :param categoryId: Category (subject prefix) id
+        :param only_recommended: List only recommended posts?
+        :param only_notice: List only notice posts?
+        :param search_type: Search type
+        :param search_position: Search position
+        :param search_value: Search keyword
+        """
+
+        url = f'https://m.dcinside.com/{boardPath}/{boardId}'
+        params = {
+            'page': page,
+            'headid': categoryId,
+            'recommend': '1' if only_recommended else '0',
+            'notice': '1' if only_notice else '0',
+            's_type': search_type or '',
+            's_pos': search_position or '',
+            'serval': search_value or ''
+        }
+
+        async with self.get(url, params=params) as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+            return [
+                Post(
+                    id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
+                    boardId=boardId,
+                    boardPath=boardPath
+                )
+                for tag in document.select('.gall-detail-lnktb')
+            ]
+
+    async def view(self, post: Post):
+        """
+        Fetches the content of a post
+
+        :param post: The post instance to fetch into
+        """
+
+        async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+            # The header info is split into `li` elements, and there are always exactly two of them.
+            # If there aren't, parsing integrity is broken as a whole anyway, so no dedicated error handling here
+            authorTag, timestampTag, *_ = document.select('.gallview-tit-box .ginfo2 > li')
+            authorAnchorTag = authorTag.select_one('a')
+
+            # Parse the creation timestamp
+            post.created_at = (
+                datetime
+                .strptime(timestampTag.get_text(strip=True), '%Y.%m.%d %H:%M')
+                .replace(tzinfo=ZoneInfo('Asia/Seoul'))
+            )
+
+            # Parse the author information
+            if authorAnchorTag:
+                # An anchor tag in the author element means the author has a gallog (profile page)
+                post.authorId = re.findall(r'\/\w+$', authorAnchorTag['href'])[0][1:]
+                post.authorName = authorAnchorTag.get_text(strip=True)
+            else:
+                authorParts = authorTag.get_text(strip=True).split('(')
+                post.authorId = authorParts.pop()[:-1].strip()  # 123.123) -> 123.123
+                post.authorName = authorParts.pop().strip()
+
+            # On the mobile web, the category prefix and the title are separated by `\n`
+            titleTexts = (
+                document
+                .select_one('.gallview-tit-box .tit')
+                .get_text(strip=True)
+                .split('\n')
+            )
+
+            # Parse the title and the category prefix
+            post.title = titleTexts.pop().strip()
+            post.category = titleTexts.pop()[1:-1].strip()  # [XX] -> XX
+
+            # Parse the body
+            post.body = document.select_one('.thum-txtin')
+
+            # Remove unneeded elements from the body
+            for tag in post.body.select('script, style'):
+                tag.extract()
+
+            print(f'{post.boardId}/{post.id}: {post.title}')
diff --git a/utils/typings.py b/utils/typings.py
new file mode 100644
index 0000000..9eda47e
--- /dev/null
+++ b/utils/typings.py
@@ -0,0 +1,6 @@
+from typing import Literal
+
+
+BoardPath = Literal['board', 'mini', 'person']
+
+SearchType = Literal['subject_m', 'subject', 'memo', 'name', 'comment']
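
Note: the deleted adapter.py and main.py wired per-request timeouts and retries (TimeoutHTTPAdapter plus urllib3's Retry) into the requests session, and this rewrite does not carry that behavior over to aiohttp. If it is wanted again, a minimal sketch in the same client-middleware style as SemaphoreMiddleware could look like the following; RetryMiddleware and its parameters are illustrative and not part of this change, while aiohttp.ClientTimeout is the library's own mechanism for the dropped timeout.

import asyncio

from aiohttp import ClientRequest, ClientResponse, ClientHandlerType


class RetryMiddleware:
    # Loosely mirrors the removed Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
    def __init__(self, attempts: int = 5, backoff_factor: float = 0.1, statuses: tuple = (404,)):
        self.attempts = attempts
        self.backoff_factor = backoff_factor
        self.statuses = statuses

    async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
        response = await handler(req)
        for attempt in range(self.attempts):
            if response.status not in self.statuses:
                break
            # Exponential backoff between attempts, like urllib3's backoff_factor
            await asyncio.sleep(self.backoff_factor * (2 ** attempt))
            response = await handler(req)
        return response

Usage would mirror main(): Scraper(middlewares=(SemaphoreMiddleware(5), RetryMiddleware()), timeout=aiohttp.ClientTimeout(total=1)).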