
migrate to aiohttp

2025-08-04 09:56:02 +09:00
parent 540a84e772
commit 8ad93caa90
10 changed files with 175 additions and 189 deletions

adapter.py

@@ -1,14 +0,0 @@
from requests.adapters import HTTPAdapter

class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        if "timeout" in kwargs:
            self.timeout = kwargs["timeout"]
            del kwargs["timeout"]
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        timeout = kwargs.get("timeout")
        if timeout is None and hasattr(self, 'timeout'):
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
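
The deleted adapter existed only to give every request a default timeout. In aiohttp the same default can be set once on the session via ClientTimeout; a minimal sketch, not part of this commit:

    import aiohttp

    async def fetch(url: str) -> str:
        # Sketch only: a 1-second total timeout, mirroring the deleted
        # TimeoutHTTPAdapter(timeout=1); applies to every request made
        # through the session unless overridden per call.
        timeout = aiohttp.ClientTimeout(total=1)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                return await response.text()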

36
main.py

@@ -1,36 +0,0 @@
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import Retry
from adapter import TimeoutHTTPAdapter
from scraper import Scraper

scraper = Scraper(concurrency=5)
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))

document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)

try:
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')
            as_completed([
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path
                )
                for post in posts  # reuse the list fetched above instead of searching twice
            ])
except KeyboardInterrupt:
    print(':-)')

models/__init__.py

@@ -1 +1 @@
-from .post import *
+from .post import Post

models/post.py

@@ -4,11 +4,14 @@ from datetime import datetime
 from bs4 import Tag
+from utils.typings import BoardPath

 @dataclass
 class Post:
+    id: int
     boardId: str
-    postId: int
+    boardPath: BoardPath
     authorId: Optional[str] = None
     authorName: Optional[str] = None
     category: Optional[str] = None

requirements.txt

@@ -1,8 +1,16 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+attrs==25.3.0
 beautifulsoup4==4.13.4
 certifi==2025.7.14
 charset-normalizer==3.4.2
+frozenlist==1.7.0
 idna==3.10
-requests==2.32.4
+lxml==6.0.0
+multidict==6.6.3
+propcache==0.3.2
 soupsieve==2.7
 typing_extensions==4.14.1
 urllib3==2.5.0
+yarl==1.20.1


@@ -1,145 +1,22 @@
-import re
-import hashlib
-from typing import Tuple, List
-from datetime import datetime
-from zoneinfo import ZoneInfo
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-from threading import Semaphore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from requests import Session
-from requests.adapters import Retry
-from requests.cookies import create_cookie
-from bs4 import BeautifulSoup
-from cgi import parse_header
-from models import Post
-
-class Scraper(Session):
-    def __init__(self, concurrency = 5):
-        super().__init__()
-        self.semaphore = Semaphore(concurrency)
-        self.headers['User-Agent'] = '(Android)'
-        self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
-        self.cookies.set_cookie(create_cookie(
-            name='list_count',
-            value='200',
-            domain='.dcinside.com'
-        ))
-
-    def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
-        with self.semaphore:
-            res = self.get(url, stream=True)
-            res.raise_for_status()
-            hash = hashlib.sha1()
-            # fuck this shit
-            _, parts = parse_header(res.headers.get('Content-Disposition'))
-            fname = parts.get('filename', '')
-            fext = fname.split('.').pop()
-            with NamedTemporaryFile('wb', dir=save_dir) as file:
-                for chunk in res.iter_content(chunk_size=8192):
-                    if chunk:
-                        file.write(chunk)
-                        hash.update(chunk)
-                return url, Path(file.name).rename(save_dir / f'{hash.hexdigest()}.{fext}')
-
-    def replace_attachment(self, post: Post, save_dir: Path):
-        src_to_tags = {
-            img.attrs['data-original'].strip(): img
-            for img in post.body.select('img[data-original]')
-        }
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.download_attachment, src, save_dir)
-                for src in src_to_tags.keys()
-            ]
-            for future in as_completed(futures):
-                src, path = future.result()
-                src_to_tags[src]['src'] = path
-
-    def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
-        with self.semaphore:
-            res = self.get(
-                f'https://m.dcinside.com/board/{boardId}',
-                params={
-                    's_type': type,
-                    's_pos': offset,
-                    'serval': value
-                })
-            res.raise_for_status()
-            document = BeautifulSoup(res.text, 'html.parser')
-            return [
-                Post(
-                    boardId=boardId,
-                    postId=int(re.findall(r'/\d+', tag.attrs.get('href'))[0][1:])
-                )
-                for tag in document.select('.gall-detail-lnktb a[href]:first-child')
-            ]
-
-    def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
-        with self.semaphore:
-            res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
-            res.raise_for_status()
-            document = BeautifulSoup(res.text, 'html.parser')
-
-            titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
-            timeTag = titleWrapTags.pop()
-            authorTag = titleWrapTags.pop()
-            authorAnchorTag = authorTag.select_one('a')
-
-            titleParts = (
-                document
-                .select_one('.gallview-tit-box .tit')
-                .get_text(strip=True)
-                .split('\r\n')
-            )
-            post.title = titleParts.pop().strip()
-            post.category = titleParts.pop()[1:-1].strip()
-
-            if authorAnchorTag:
-                post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
-                post.authorName = authorAnchorTag.get_text(strip=True)
-            else:
-                authorParts = authorTag.get_text(strip=True).split('(')
-                post.authorId = authorParts[1][:-1].strip()
-                post.authorName = authorParts[0].strip()
-
-            post.created_at = (
-                datetime
-                .strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
-                .replace(tzinfo=ZoneInfo('Asia/Seoul'))
-            )
-            post.body = document.select_one('.thum-txtin')
-
-            # lol
-            if post.authorName != 'ori':
-                return
-
-            print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
-            if attachment_path:
-                self.replace_attachment(post, attachment_path)
-            if document_path:
-                (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
+import asyncio
+from pathlib import Path
+
+from utils.middlewares import SemaphoreMiddleware
+from utils.scraper import Scraper
+
+async def main():
+    middlewares = (
+        SemaphoreMiddleware(5),
+    )
+    async with Scraper(middlewares=middlewares) as scraper:
+        posts = await scraper.list('roh', 'person')
+        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
+            await future
+
+if __name__ == '__main__':
+    asyncio.run(main())

0
utils/__init__.py Normal file

8
utils/middlewares.py Normal file

@@ -0,0 +1,8 @@
import asyncio
from aiohttp import ClientRequest, ClientResponse, ClientHandlerType

class SemaphoreMiddleware(asyncio.Semaphore):
    # Client middleware that caps how many requests may be in flight at once.
    async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
        async with self:
            return await handler(req)
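
The old requests stack also mounted urllib3's Retry; this commit drops that behaviour without a replacement. The same middleware pattern could restore it. A hypothetical sketch, not part of this commit, in the same style as SemaphoreMiddleware:

    import asyncio
    from aiohttp import ClientRequest, ClientResponse, ClientHandlerType

    # Hypothetical: re-creates the old
    # Retry(total=5, backoff_factor=0.1, status_forcelist=[404]) behaviour.
    class RetryMiddleware:
        def __init__(self, total: int = 5, backoff_factor: float = 0.1, status_forcelist=(404,)):
            self.total = total
            self.backoff_factor = backoff_factor
            self.status_forcelist = set(status_forcelist)

        async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
            for attempt in range(self.total):
                response = await handler(req)
                if response.status not in self.status_forcelist:
                    break
                # Exponential backoff before retrying, like urllib3's backoff_factor
                await asyncio.sleep(self.backoff_factor * (2 ** attempt))
            return response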

134
utils/scraper.py Normal file

@@ -0,0 +1,134 @@
import re
from typing import Optional, List
from datetime import datetime
from zoneinfo import ZoneInfo

from aiohttp import ClientSession
from bs4 import BeautifulSoup

from .typings import BoardPath, SearchType
from models import Post

class Scraper(ClientSession):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Default headers required for mobile pages and attachment requests
        self.headers['User-Agent'] = '(Android)'
        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'
        # Number of items to fetch per post-list request
        self.cookie_jar.update_cookies({
            'list_count': '200'
        })

    async def __aenter__(self) -> 'Scraper':
        return self

    async def list(
        self,
        boardId: str,
        boardPath: BoardPath = 'board',
        page: int = 1,
        categoryId: int = 0,
        only_recommended: bool = False,
        only_notice: bool = False,
        search_type: Optional[SearchType] = None,
        search_position: Optional[int] = None,
        search_value: Optional[str] = None
    ) -> List[Post]:
        """
        Fetches the posts on a board that match the given conditions.

        :param boardId: board ID
        :param boardPath: board path (board type)
        :param page: page number
        :param categoryId: category (subject header) ID
        :param only_recommended: whether to list recommended posts only
        :param only_notice: whether to list notice posts only
        :param search_type: search type
        :param search_position: search offset
        :param search_value: search keyword
        """
        url = f'https://m.dcinside.com/{boardPath}/{boardId}'
        params = {
            'page': page,
            'headid': categoryId,
            'recommend': only_recommended and '1' or '0',
            'notice': only_notice and '1' or '0',
            's_type': search_type or '',
            's_pos': search_position or '',
            'serval': search_value or ''
        }
        async with self.get(url, params=params) as response:
            html = await response.text()

        document = BeautifulSoup(html, 'lxml')
        return [
            Post(
                id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
                boardId=boardId,
                boardPath=boardPath
            )
            for tag in document.select('.gall-detail-lnktb')
        ]

    async def view(self, post: Post):
        """
        Fetches the content of a post.

        :param post: the post instance to fetch
        """
        async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
            html = await response.text()

        document = BeautifulSoup(html, 'lxml')

        # The title header is split into `li` elements, and no matter what there
        # are exactly two of them; if not, the whole parse is broken anyway, so
        # there is no error handling here.
        authorTag, timestampTag, *_ = document.select('.gallview-tit-box .ginfo2 > li')
        authorAnchorTag = authorTag.select_one('a')

        # Parse the creation time
        post.created_at = (
            datetime
            .strptime(timestampTag.get_text(strip=True), '%Y.%m.%d %H:%M')
            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
        )

        # Parse the author information
        if authorAnchorTag:
            # An anchor tag in the author element means the author has a gallog
            post.authorId = re.findall(r'\/\w+$', authorAnchorTag['href'])[0][1:]
            post.authorName = authorAnchorTag.get_text(strip=True)
        else:
            authorParts = authorTag.get_text(strip=True).split('(')
            post.authorId = authorParts.pop()[:-1].strip()  # 123.123) -> 123.123
            post.authorName = authorParts.pop().strip()

        # On the mobile web the category and title are separated by `\n`
        titleTexts = (
            document
            .select_one('.gallview-tit-box .tit')
            .get_text(strip=True)
            .split('\n')
        )

        # Parse the title and category
        post.title = titleTexts.pop().strip()
        post.category = titleTexts.pop()[1:-1].strip()  # [XX] -> XX

        # Parse the body
        post.body = document.select_one('.thum-txtin')

        # Remove unneeded elements from the body
        for tag in post.body.select('script, style'):
            tag.extract()

        print(f'{post.boardId}/{post.id}: {post.title}')
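
The old scraper.search('aoegame', 'name', -offset, 'ori') call from the deleted main.py maps onto the new keyword-based signature. A hypothetical equivalent, assuming an offset variable as before:

    # Hypothetical sketch, not part of this commit: the old positional
    # search() call expressed through the new list() signature.
    posts = await scraper.list(
        'aoegame',                 # boardId
        'board',                   # boardPath
        search_type='name',        # old s_type
        search_position=-offset,   # old s_pos
        search_value='ori'         # old serval
    )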

6
utils/typings.py Normal file

@@ -0,0 +1,6 @@
from typing import Literal
BoardPath = Literal['board', 'mini', 'person']
SearchType = Literal['subject_m', 'subject', 'memo', 'name', 'comment']
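
BoardPath and SearchType are Literal aliases, so a type checker rejects values outside these sets. A small illustrative sketch; the board_url helper is hypothetical:

    from utils.typings import BoardPath

    # Hypothetical helper for illustration only; mirrors the URL
    # pattern used in utils/scraper.py.
    def board_url(boardId: str, boardPath: BoardPath) -> str:
        return f'https://m.dcinside.com/{boardPath}/{boardId}'

    board_url('aoegame', 'board')    # OK
    board_url('aoegame', 'gallery')  # flagged by a type checker: not a BoardPath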