Compare commits

...

3 Commits

SHA1 Message Date
8ad93caa90 migrate to aiohttp 2025-08-04 09:56:02 +09:00
540a84e772 refactor structure for sane workflow :) 2025-08-04 08:20:59 +09:00
6dbc4a4e54 rename to sane class name 2025-08-04 08:16:44 +09:00
9 changed files with 191 additions and 193 deletions

adapter.py Deleted file

@@ -1,14 +0,0 @@
from requests.adapters import HTTPAdapter


class WhyTheFuckRequestsHasNoTimeoutInAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        if "timeout" in kwargs:
            self.timeout = kwargs["timeout"]
            del kwargs["timeout"]
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        timeout = kwargs.get("timeout")
        if timeout is None and hasattr(self, 'timeout'):
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
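For context on this deletion: the adapter existed only because requests has no session-wide default timeout, so one had to be injected into every send. aiohttp supports this directly through `ClientTimeout`, which is presumably why the workaround could be dropped. A minimal sketch of the aiohttp equivalent (the 1-second total mirrors the timeout mounted in the old main.py below; the sketch itself is not part of this diff):

    import aiohttp

    async def fetch(url: str) -> str:
        # Session-wide default timeout: applied to every request made
        # through this session, no adapter subclass required.
        timeout = aiohttp.ClientTimeout(total=1)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                return await response.text()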

models/__init__.py Normal file

@@ -0,0 +1 @@
from .post import Post

models/post.py Normal file

@@ -0,0 +1,20 @@
from typing import Optional
from dataclasses import dataclass
from datetime import datetime

from bs4 import Tag

from utils.typings import BoardPath


@dataclass
class Post:
    id: int
    boardId: str
    boardPath: BoardPath
    authorId: Optional[str] = None
    authorName: Optional[str] = None
    category: Optional[str] = None
    title: Optional[str] = None
    body: Optional[Tag] = None
    created_at: Optional[datetime] = None

requirements.txt

@@ -1,8 +1,16 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+attrs==25.3.0
 beautifulsoup4==4.13.4
 certifi==2025.7.14
 charset-normalizer==3.4.2
+frozenlist==1.7.0
 idna==3.10
-requests==2.32.4
+lxml==6.0.0
+multidict==6.6.3
+propcache==0.3.2
 soupsieve==2.7
 typing_extensions==4.14.1
 urllib3==2.5.0
+yarl==1.20.1

main.py

@@ -1,187 +1,22 @@
-import re
-import hashlib
+import asyncio
-from typing import Optional, Tuple, List
-from dataclasses import dataclass
-from datetime import datetime
-from zoneinfo import ZoneInfo
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-from threading import Semaphore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from requests import Session
-from requests.adapters import Retry
-from requests.cookies import create_cookie
-from bs4 import BeautifulSoup, Tag
-from cgi import parse_header
-from adapter import WhyTheFuckRequestsHasNoTimeoutInAdapter
+from utils.middlewares import SemaphoreMiddleware
+from utils.scraper import Scraper
-@dataclass
-class Post:
-    boardId: str
-    postId: int
-    authorId: Optional[str] = None
-    authorName: Optional[str] = None
-    category: Optional[str] = None
-    title: Optional[str] = None
-    body: Optional[Tag] = None
-    created_at: Optional[datetime] = None
+async def main():
+    middlewares = (
+        SemaphoreMiddleware(5),
+    )
-class Scraper(Session):
-    def __init__(self, concurrency = 5):
-        super().__init__()
-        self.semaphore = Semaphore(concurrency)
-        self.headers['User-Agent'] = '(Android)'
-        self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
-        self.cookies.set_cookie(create_cookie(
-            name='list_count',
-            value='200',
-            domain='.dcinside.com'
-        ))
-    def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
-        with self.semaphore:
-            res = self.get(url, stream=True)
-            res.raise_for_status()
-            hash = hashlib.sha1()
-            # fuck this shit
-            _, parts = parse_header(res.headers.get('Content-Disposition'))
-            fname = parts.get('filename', '')
-            fext = fname.split('.').pop()
-            with NamedTemporaryFile('wb', dir=save_dir) as file:
-                for chunk in res.iter_content(chunk_size=8192):
-                    if chunk:
-                        file.write(chunk)
-                        hash.update(chunk)
-                return url, Path(file.name).rename(save_dir / f'{hash.hexdigest()}.{fext}')
-    def replace_attachment(self, post: Post, save_dir: Path):
-        src_to_tags = {
-            img.attrs['data-original'].strip(): img
-            for img in post.body.select('img[data-original]')
-        }
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.download_attachment, src, save_dir)
-                for src in src_to_tags.keys()
-            ]
-            for future in as_completed(futures):
-                src, path = future.result()
-                src_to_tags[src]['src'] = path
-    def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
-        with self.semaphore:
-            res = self.get(
-                f'https://m.dcinside.com/board/{boardId}',
-                params={
-                    's_type': type,
-                    's_pos': offset,
-                    'serval': value
-                })
-            res.raise_for_status()
-            document = BeautifulSoup(res.text, 'html.parser')
-            return [
-                Post(
-                    boardId=boardId,
-                    postId=int(re.findall(r'/\d+', tag.attrs.get('href'))[0][1:])
-                )
-                for tag in document.select('.gall-detail-lnktb a[href]:first-child')
-            ]
-    def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
-        with self.semaphore:
-            res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
-            res.raise_for_status()
+    async with Scraper(middlewares=middlewares) as scraper:
+        posts = await scraper.list('roh', 'person')
-        document = BeautifulSoup(res.text, 'html.parser')
-        titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
-        timeTag = titleWrapTags.pop()
-        authorTag = titleWrapTags.pop()
-        authorAnchorTag = authorTag.select_one('a')
-        titleParts = (
-            document
-            .select_one('.gallview-tit-box .tit')
-            .get_text(strip=True)
-            .split('\r\n')
-        )
-        post.title = titleParts.pop().strip()
-        post.category = titleParts.pop()[1:-1].strip()
-        if authorAnchorTag:
-            post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
-            post.authorName = authorAnchorTag.get_text(strip=True)
-        else:
-            authorParts = authorTag.get_text(strip=True).split('(')
-            post.authorId = authorParts[1][:-1].strip()
-            post.authorName = authorParts[0].strip()
-        post.created_at = (
-            datetime
-            .strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
-            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
-        )
-        post.body = document.select_one('.thum-txtin')
-        # lol
-        if post.authorName != 'ori':
-            return
-        print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
-        if attachment_path:
-            self.replace_attachment(post, attachment_path)
-        if document_path:
-            (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
+        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
+            await future
-scraper = Scraper(concurrency=5)
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', WhyTheFuckRequestsHasNoTimeoutInAdapter(timeout=1, max_retries=retries))
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-except KeyboardInterrupt:
-    print(':-)')
+if __name__ == '__main__':
+    asyncio.run(main())

utils/__init__.py Normal file

utils/middlewares.py Normal file

@@ -0,0 +1,8 @@
import asyncio

from aiohttp import ClientRequest, ClientResponse, ClientHandlerType


class SemaphoreMiddleware(asyncio.Semaphore):
    # Caps concurrency: a request only proceeds once the semaphore is
    # acquired, and the slot is released when the response comes back
    async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
        async with self:
            return await handler(req)
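Because the middleware subclasses `asyncio.Semaphore` directly, a single instance caps in-flight requests across every call made through the session. A standalone usage sketch (assumes an aiohttp version with client-middleware support, such as the 3.12.15 pinned above; the URL is a placeholder):

    import asyncio
    from aiohttp import ClientSession

    from utils.middlewares import SemaphoreMiddleware

    async def main():
        # At most two requests run concurrently through this session
        async with ClientSession(middlewares=(SemaphoreMiddleware(2),)) as session:
            async with session.get('https://example.com') as response:  # placeholder URL
                print(response.status)

    asyncio.run(main())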

utils/scraper.py Normal file

@@ -0,0 +1,134 @@
import re
from typing import Optional, List
from datetime import datetime
from zoneinfo import ZoneInfo

from aiohttp import ClientSession
from bs4 import BeautifulSoup

from .typings import BoardPath, SearchType
from models import Post


class Scraper(ClientSession):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Default headers required for mobile-page and attachment requests
        self.headers['User-Agent'] = '(Android)'
        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'
        # Number of items to fetch per post-list request
        self.cookie_jar.update_cookies({
            'list_count': '200'
        })

    async def __aenter__(self) -> 'Scraper':
        return self

    async def list(
        self,
        boardId: str,
        boardPath: BoardPath = 'board',
        page: int = 1,
        categoryId: int = 0,
        only_recommended: bool = False,
        only_notice: bool = False,
        search_type: Optional[SearchType] = None,
        search_position: Optional[int] = None,
        search_value: Optional[str] = None
    ) -> List[Post]:
        """
        Fetches the posts on a board that match the given criteria

        :param boardId: board id
        :param boardPath: board path (board type)
        :param page: page number
        :param categoryId: subject-head (category) id
        :param only_recommended: fetch only recommended posts?
        :param only_notice: fetch only notice posts?
        :param search_type: search type
        :param search_position: search position (post-id offset)
        :param search_value: search keyword
        """
        url = f'https://m.dcinside.com/{boardPath}/{boardId}'
        params = {
            'page': page,
            'headid': categoryId,
            'recommend': '1' if only_recommended else '0',
            'notice': '1' if only_notice else '0',
            's_type': search_type or '',
            's_pos': search_position or '',
            'serval': search_value or ''
        }

        async with self.get(url, params=params) as response:
            html = await response.text()

        document = BeautifulSoup(html, 'lxml')

        return [
            Post(
                id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
                boardId=boardId,
                boardPath=boardPath
            )
            for tag in document.select('.gall-detail-lnktb')
        ]

    async def view(self, post: Post):
        """
        Fetches the content of a post

        :param post: the post instance to populate
        """
        async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
            html = await response.text()

        document = BeautifulSoup(html, 'lxml')

        # The header block is split into `li` elements, and no matter what there are
        # exactly two; if not, parsing integrity is broken anyway, so no error handling here
        authorTag, timestampTag, *_ = document.select('.gallview-tit-box .ginfo2 > li')
        authorAnchorTag = authorTag.select_one('a')

        # Parse the creation date
        post.created_at = (
            datetime
            .strptime(timestampTag.get_text(strip=True), '%Y.%m.%d %H:%M')
            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
        )

        # Parse the author info
        if authorAnchorTag:
            # An anchor tag in the author element means the author has a gallog
            post.authorId = re.findall(r'\/\w+$', authorAnchorTag['href'])[0][1:]
            post.authorName = authorAnchorTag.get_text(strip=True)
        else:
            authorParts = authorTag.get_text(strip=True).split('(')
            post.authorId = authorParts.pop()[:-1].strip()  # 123.123) -> 123.123
            post.authorName = authorParts.pop().strip()

        # On the mobile web, the subject head and title are separated by `\n`
        titleTexts = (
            document
            .select_one('.gallview-tit-box .tit')
            .get_text(strip=True)
            .split('\n')
        )

        # Parse the title and subject head
        post.title = titleTexts.pop().strip()
        post.category = titleTexts.pop()[1:-1].strip()  # [XX] -> XX

        # Parse the body
        post.body = document.select_one('.thum-txtin')

        # Strip unneeded elements from the body
        for tag in post.body.select('script, style'):
            tag.extract()

        print(f'{post.boardId}/{post.id}: {post.title}')
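The `search_*` parameters replace the old synchronous `search` method. A sketch of the same find-posts-by-author flow on the new API (board id and keyword copied from the old main.py above, purely illustrative):

    import asyncio

    from utils.middlewares import SemaphoreMiddleware
    from utils.scraper import Scraper

    async def find_by_name():
        async with Scraper(middlewares=(SemaphoreMiddleware(5),)) as scraper:
            # Values mirror the old script: board 'aoegame', author name 'ori'
            posts = await scraper.list('aoegame', search_type='name', search_value='ori')
            for coro in asyncio.as_completed([scraper.view(p) for p in posts]):
                await coro

    asyncio.run(find_by_name())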

utils/typings.py Normal file

@@ -0,0 +1,6 @@
from typing import Literal

BoardPath = Literal['board', 'mini', 'person']
SearchType = Literal['subject_m', 'subject', 'memo', 'name', 'comment']
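These `Literal` aliases let a static type checker reject invalid board paths and search types at call sites; a quick illustration (the helper function is hypothetical, not from this diff):

    from utils.typings import BoardPath

    def board_url(boardId: str, boardPath: BoardPath = 'board') -> str:
        return f'https://m.dcinside.com/{boardPath}/{boardId}'

    board_url('aoegame', 'mini')     # OK: 'mini' is a valid BoardPath
    board_url('aoegame', 'gallery')  # mypy/pyright error: not a BoardPath member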