migrate to aiohttp
adapter.py (14 lines deleted)

@@ -1,14 +0,0 @@
-from requests.adapters import HTTPAdapter
-
-
-class TimeoutHTTPAdapter(HTTPAdapter):
-    def __init__(self, *args, **kwargs):
-        if "timeout" in kwargs:
-            self.timeout = kwargs["timeout"]
-            del kwargs["timeout"]
-        super().__init__(*args, **kwargs)
-
-    def send(self, request, **kwargs):
-        timeout = kwargs.get("timeout")
-        if timeout is None and hasattr(self, 'timeout'):
-            kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
main.py (36 lines deleted)

@@ -1,36 +0,0 @@
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from requests.adapters import Retry
-
-from adapter import TimeoutHTTPAdapter
-from scraper import Scraper
-
-
-scraper = Scraper(concurrency=5)
-
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
-
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-
-
-except KeyboardInterrupt:
-    print(':-)')
@@ -1 +1 @@
-from .post import *
+from .post import Post
@@ -4,11 +4,14 @@ from datetime import datetime
 from bs4 import Tag
 
+from utils.typings import BoardPath
+
 
 @dataclass
 class Post:
+    id: int
     boardId: str
-    postId: int
+    boardPath: BoardPath
     authorId: Optional[str] = None
    authorName: Optional[str] = None
    category: Optional[str] = None
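With the reworked fields, a Post now carries a numeric id plus the board it came from, and the author and category stay optional until view() fills them in. A minimal construction sketch, mirroring how the new utils/scraper.py builds posts (the id value here is a made-up example):

    from models import Post

    # id, boardId and boardPath are the required fields after this change.
    post = Post(id=1234567, boardId='aoegame', boardPath='board')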
@@ -1,8 +1,16 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+attrs==25.3.0
 beautifulsoup4==4.13.4
 certifi==2025.7.14
 charset-normalizer==3.4.2
+frozenlist==1.7.0
 idna==3.10
-requests==2.32.4
+lxml==6.0.0
+multidict==6.6.3
+propcache==0.3.2
 soupsieve==2.7
 typing_extensions==4.14.1
 urllib3==2.5.0
+yarl==1.20.1
scraper.py (147 lines changed)

@@ -1,145 +1,22 @@
-import re
-import hashlib
-
-from typing import Tuple, List
-from datetime import datetime
-from zoneinfo import ZoneInfo
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-
-from threading import Semaphore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from requests import Session
-from requests.adapters import Retry
-from requests.cookies import create_cookie
-
-from bs4 import BeautifulSoup
-
-from cgi import parse_header
-from models import Post
-
-
-class Scraper(Session):
-
-    def __init__(self, concurrency=5):
-        super().__init__()
-
-        self.semaphore = Semaphore(concurrency)
-
-        self.headers['User-Agent'] = '(Android)'
-        self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
-
-        self.cookies.set_cookie(create_cookie(
-            name='list_count',
-            value='200',
-            domain='.dcinside.com'
-        ))
-
-    def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
-        with self.semaphore:
-            res = self.get(url, stream=True)
-            res.raise_for_status()
-
-        hash = hashlib.sha1()
-
-        # fuck this shit
-        _, parts = parse_header(res.headers.get('Content-Disposition'))
-        fname = parts.get('filename', '')
-        fext = fname.split('.').pop()
-
-        with NamedTemporaryFile('wb', dir=save_dir) as file:
-            for chunk in res.iter_content(chunk_size=8192):
-                if chunk:
-                    file.write(chunk)
-                    hash.update(chunk)
-
-            return url, Path(file.name).rename(save_dir / f'{hash.hexdigest()}.{fext}')
-
-    def replace_attachment(self, post: Post, save_dir: Path):
-        src_to_tags = {
-            img.attrs['data-original'].strip(): img
-            for img in post.body.select('img[data-original]')
-        }
-
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.download_attachment, src, save_dir)
-                for src in src_to_tags.keys()
-            ]
-
-            for future in as_completed(futures):
-                src, path = future.result()
-                src_to_tags[src]['src'] = path
-
-    def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
-        with self.semaphore:
-            res = self.get(
-                f'https://m.dcinside.com/board/{boardId}',
-                params={
-                    's_type': type,
-                    's_pos': offset,
-                    'serval': value
-                })
-            res.raise_for_status()
-
-        document = BeautifulSoup(res.text, 'html.parser')
-        return [
-            Post(
-                boardId=boardId,
-                postId=int(re.findall(r'/\d+', tag.attrs.get('href'))[0][1:])
-            )
-            for tag in document.select('.gall-detail-lnktb a[href]:first-child')
-        ]
-
-    def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
-        with self.semaphore:
-            res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
-            res.raise_for_status()
-
-        document = BeautifulSoup(res.text, 'html.parser')
-
-        titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
-
-        timeTag = titleWrapTags.pop()
-        authorTag = titleWrapTags.pop()
-        authorAnchorTag = authorTag.select_one('a')
-
-        titleParts = (
-            document
-            .select_one('.gallview-tit-box .tit')
-            .get_text(strip=True)
-            .split('\r\n')
-        )
-
-        post.title = titleParts.pop().strip()
-        post.category = titleParts.pop()[1:-1].strip()
-
-        if authorAnchorTag:
-            post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
-            post.authorName = authorAnchorTag.get_text(strip=True)
-        else:
-            authorParts = authorTag.get_text(strip=True).split('(')
-            post.authorId = authorParts[1][:-1].strip()
-            post.authorName = authorParts[0].strip()
-
-        post.created_at = (
-            datetime
-            .strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
-            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
-        )
-
-        post.body = document.select_one('.thum-txtin')
-
-        # lol
-        if post.authorName != 'ori':
-            return
-
-        print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
-
-        if attachment_path:
-            self.replace_attachment(post, attachment_path)
-
-        if document_path:
-            (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
+import asyncio
+
+from pathlib import Path
+
+from utils.middlewares import SemaphoreMiddleware
+from utils.scraper import Scraper
+
+
+async def main():
+    middlewares = (
+        SemaphoreMiddleware(5),
+    )
+
+    async with Scraper(middlewares=middlewares) as scraper:
+        posts = await scraper.list('roh', 'person')
+
+        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
+            await future
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
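The deleted adapter.py and the requests Retry setup have no direct counterpart in the new entry point. A minimal sketch of how a session-wide timeout could be reintroduced, assuming only that Scraper forwards keyword arguments to aiohttp's ClientSession (its __init__ does) and using aiohttp's ClientTimeout; the old retry policy is not reproduced here:

    import asyncio
    from aiohttp import ClientTimeout
    from utils.middlewares import SemaphoreMiddleware
    from utils.scraper import Scraper

    async def main():
        # timeout= passes through Scraper.__init__ to ClientSession;
        # ClientTimeout(total=1) roughly plays the role of TimeoutHTTPAdapter(timeout=1).
        async with Scraper(
            middlewares=(SemaphoreMiddleware(5),),
            timeout=ClientTimeout(total=1)
        ) as scraper:
            posts = await scraper.list('roh', 'person')
            for future in asyncio.as_completed([scraper.view(p) for p in posts]):
                await future

    asyncio.run(main())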
utils/__init__.py (new empty file)

utils/middlewares.py (8 lines, new file)

@@ -0,0 +1,8 @@
+import asyncio
+
+from aiohttp import ClientRequest, ClientResponse, ClientHandlerType
+
+class SemaphoreMiddleware(asyncio.Semaphore):
+    async def __call__(self, req: ClientRequest, handler: ClientHandlerType) -> ClientResponse:
+        async with self:
+            return await handler(req)
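SemaphoreMiddleware is just an asyncio.Semaphore wrapped around every outgoing request, so the value passed to it caps how many requests the session has in flight at once, replacing the old threading.Semaphore in the requests-based Scraper. A short usage sketch on a bare ClientSession, assuming aiohttp 3.12+ (the first release with client middlewares); the URL is a placeholder:

    import asyncio
    from aiohttp import ClientSession
    from utils.middlewares import SemaphoreMiddleware

    async def demo():
        # At most two of the five requests below run concurrently.
        async with ClientSession(middlewares=(SemaphoreMiddleware(2),)) as session:
            async def fetch(url):
                async with session.get(url) as response:
                    return response.status

            print(await asyncio.gather(*[fetch('https://example.com/') for _ in range(5)]))

    asyncio.run(demo())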
utils/scraper.py (134 lines, new file)

@@ -0,0 +1,134 @@
+import re
+from typing import Optional, List
+from datetime import datetime
+from zoneinfo import ZoneInfo
+
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
+
+from .typings import BoardPath, SearchType
+from models import Post
+
+
+class Scraper(ClientSession):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Default headers required for mobile page and attachment requests
+        self.headers['User-Agent'] = '(Android)'
+        self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'
+
+        # Number of items to fetch per post-list request
+        self.cookie_jar.update_cookies({
+            'list_count': '200'
+        })
+
+    async def __aenter__(self) -> 'Scraper':
+        return self
+
+    async def list(
+        self,
+        boardId: str,
+        boardPath: BoardPath = 'board',
+        page: int = 1,
+        categoryId: int = 0,
+        only_recommended: bool = False,
+        only_notice: bool = False,
+        search_type: Optional[SearchType] = None,
+        search_position: Optional[int] = None,
+        search_value: Optional[str] = None
+    ) -> List[Post]:
+        """
+        Fetches the list of posts on a board that match the given conditions.
+
+        :param boardId: board ID
+        :param boardPath: board path (board type)
+        :param page: page number
+        :param categoryId: category (subject prefix) ID
+        :param only_recommended: whether to list only recommended posts
+        :param only_notice: whether to list only notice posts
+        :param search_type: search type
+        :param search_position: search position (offset)
+        :param search_value: search keyword
+        """
+
+        url = f'https://m.dcinside.com/{boardPath}/{boardId}'
+        params = {
+            'page': page,
+            'headid': categoryId,
+            'recommend': only_recommended and '1' or '0',
+            'notice': only_notice and '1' or '0',
+            's_type': search_type or '',
+            's_pos': search_position or '',
+            'serval': search_value or ''
+        }
+
+        async with self.get(url, params=params) as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+        return [
+            Post(
+                id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
+                boardId=boardId,
+                boardPath=boardPath
+            )
+            for tag in document.select('.gall-detail-lnktb')
+        ]
+
+    async def view(self, post: Post):
+        """
+        Fetches the content of a post.
+
+        :param post: the Post instance to fetch
+        """
+
+        async with self.get(f'https://m.dcinside.com/{post.boardPath}/{post.boardId}/{post.id}') as response:
+            html = await response.text()
+            document = BeautifulSoup(html, 'lxml')
+
+        # The title header is split into `li` elements, and there are always exactly two of them.
+        # If not, parsing is broken as a whole anyway, so no special error handling here.
+        authorTag, timestampTag, *_ = document.select('.gallview-tit-box .ginfo2 > li')
+        authorAnchorTag = authorTag.select_one('a')
+
+        # Parse the creation date
+        post.created_at = (
+            datetime
+            .strptime(timestampTag.get_text(strip=True), '%Y.%m.%d %H:%M')
+            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
+        )
+
+        # Parse author information
+        if authorAnchorTag:
+            # An anchor tag in the author element means the author has a gallog account
+            post.authorId = re.findall(r'\/\w+$', authorAnchorTag['href'])[0][1:]
+            post.authorName = authorAnchorTag.get_text(strip=True)
+        else:
+            authorParts = authorTag.get_text(strip=True).split('(')
+            post.authorId = authorParts.pop()[:-1].strip()  # 123.123) -> 123.123
+            post.authorName = authorParts.pop().strip()
+
+        # On the mobile web the category prefix and the title are separated by `\n`
+        titleTexts = (
+            document
+            .select_one('.gallview-tit-box .tit')
+            .get_text(strip=True)
+            .split('\n')
+        )
+
+        # Parse the title and category
+        post.title = titleTexts.pop().strip()
+        post.category = titleTexts.pop()[1:~1].strip()  # [XX] -> XX
+
+        # Parse the body
+        post.body = document.select_one('.thum-txtin')
+
+        # Remove unneeded elements from the body
+        for tag in post.body.select('script, style'):
+            tag.extract()
+
+        print(f'{post.boardId}/{post.id}: {post.title}')
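A usage sketch of the new list()/view() API that expresses the search the old main.py performed, reusing the same sample board, search type, offset and keyword from the deleted code; the concurrency of 5 matches the old Scraper default:

    import asyncio
    from utils.middlewares import SemaphoreMiddleware
    from utils.scraper import Scraper

    async def search_example():
        async with Scraper(middlewares=(SemaphoreMiddleware(5),)) as scraper:
            # Same board, search type, offset and keyword as the old main.py
            posts = await scraper.list(
                'aoegame',
                search_type='name',
                search_position=-22760681,
                search_value='ori'
            )
            for post in posts:
                await scraper.view(post)

    asyncio.run(search_example())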
utils/typings.py (6 lines, new file)

@@ -0,0 +1,6 @@
+from typing import Literal
+
+
+BoardPath = Literal['board', 'mini', 'person']
+
+SearchType = Literal['subject_m', 'subject', 'memo', 'name', 'comment']