1
0

migrate to aiohttp

This commit is contained in:
2025-08-04 09:56:02 +09:00
parent 540a84e772
commit 8ad93caa90
10 changed files with 175 additions and 189 deletions

View File

@@ -1,145 +1,22 @@
import re
import hashlib
import asyncio
from typing import Tuple, List
from datetime import datetime
from zoneinfo import ZoneInfo
from pathlib import Path
from tempfile import NamedTemporaryFile
from threading import Semaphore
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests import Session
from requests.adapters import Retry
from requests.cookies import create_cookie
from bs4 import BeautifulSoup
from cgi import parse_header
from models import Post
from utils.middlewares import SemaphoreMiddleware
from utils.scraper import Scraper
# NOTE(review): diff-rendering artifact — the class header of the DELETED
# requests-based Scraper is immediately followed by the opening lines of the
# NEW aiohttp-based main(); the two file versions are interleaved in this
# view. Kept byte-identical and annotated only.
class Scraper(Session):
# (new version) async entry point of the aiohttp rewrite — not part of the class
async def main():
# (new version) middleware stack capping concurrency at 5 in-flight requests
middlewares = (
SemaphoreMiddleware(5),
)
def __init__(self, concurrency = 5):
    """Build a requests Session preconfigured for scraping m.dcinside.com.

    A semaphore caps the number of concurrent requests issued through this
    session; mobile-site headers and the ``list_count`` cookie are installed
    so list pages return 200 posts each.
    """
    super().__init__()
    # Shared across worker threads to bound in-flight requests.
    self.semaphore = Semaphore(concurrency)
    # The mobile site serves the markup the parsing methods expect.
    self.headers['User-Agent'] = '(Android)'
    self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
    # Ask the site for 200 posts per list page instead of the default.
    list_count = create_cookie(
        name='list_count',
        value='200',
        domain='.dcinside.com',
    )
    self.cookies.set_cookie(list_count)
def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
    """Download one attachment and store it under its SHA-1 content hash.

    Streams the response into a temporary file inside ``save_dir`` while
    hashing the bytes, then renames it to ``<sha1>.<ext>`` so repeated
    downloads of identical content deduplicate naturally.

    Returns:
        Tuple of the source ``url`` and the final ``Path`` of the saved file.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    with self.semaphore:
        res = self.get(url, stream=True)
        res.raise_for_status()
        digest = hashlib.sha1()
        # Content-Disposition may be absent; default to '' so parse_header
        # does not crash on None.
        # NOTE(review): cgi.parse_header is deprecated (cgi removed in 3.13);
        # migrate to email.message.Message when the file is next touched.
        _, parts = parse_header(res.headers.get('Content-Disposition', ''))
        fname = parts.get('filename', '')
        fext = fname.rsplit('.', 1)[-1]
        # delete=False: the temp file is renamed to its final name below, so
        # the default cleanup-on-close would raise FileNotFoundError (and on
        # Windows the rename itself fails while the handle is still open).
        file = NamedTemporaryFile('wb', dir=save_dir, delete=False)
        try:
            for chunk in res.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
                    digest.update(chunk)
        finally:
            file.close()
        return url, Path(file.name).rename(save_dir / f'{digest.hexdigest()}.{fext}')
def replace_attachment(self, post: Post, save_dir: Path):
    """Download every attachment referenced by the post body in parallel and
    repoint each ``<img>`` tag's ``src`` at the local copy."""
    # Map each attachment URL to the tag referencing it so the tag can be
    # patched as soon as its download completes.
    tags_by_src = {}
    for img in post.body.select('img[data-original]'):
        tags_by_src[img.attrs['data-original'].strip()] = img
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(self.download_attachment, src, save_dir)
            for src in tags_by_src
        ]
        for done in as_completed(pending):
            src, local_path = done.result()
            tags_by_src[src]['src'] = local_path
def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
    """Run a board search and return stub Post objects (ids only).

    Only ``boardId``/``postId`` are filled in; call ``view()`` to hydrate
    the rest of each post.
    """
    query = {
        's_type': type,
        's_pos': offset,
        'serval': value,
    }
    # Hold the semaphore only for the network round-trip.
    with self.semaphore:
        res = self.get(f'https://m.dcinside.com/board/{boardId}', params=query)
        res.raise_for_status()
    document = BeautifulSoup(res.text, 'html.parser')
    results = []
    for anchor in document.select('.gall-detail-lnktb a[href]:first-child'):
        # hrefs end in the numeric post id; strip the leading slash.
        numeric_tail = re.findall(r'/\d+', anchor.attrs.get('href'))[0][1:]
        results.append(Post(boardId=boardId, postId=int(numeric_tail)))
    return results
# NOTE(review): this span is a diff rendering — lines from the DELETED
# requests-based Scraper.view() are interleaved with lines from the NEW
# async main() of the aiohttp rewrite. Not runnable as-is; kept
# byte-identical and annotated only.
def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
# Fetch the post page under the shared concurrency semaphore.
with self.semaphore:
res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
res.raise_for_status()
# --- interleaved lines from the NEW async main(), not part of view() ---
async with Scraper(middlewares=middlewares) as scraper:
posts = await scraper.list('roh', 'person')
# --- end interleaved lines ---
document = BeautifulSoup(res.text, 'html.parser')
# --- interleaved lines from the NEW async main(), not part of view() ---
for future in asyncio.as_completed([scraper.view(p) for p in posts]):
await future
# --- end interleaved lines ---
# Header <li> list: last entry is the timestamp, the one before it the author.
titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
timeTag = titleWrapTags.pop()
authorTag = titleWrapTags.pop()
authorAnchorTag = authorTag.select_one('a')
# Title text splits on CRLF: "[category]" then the title proper.
titleParts = (
document
.select_one('.gallview-tit-box .tit')
.get_text(strip=True)
.split('\r\n')
)
post.title = titleParts.pop().strip()
# Strip the surrounding brackets from "[category]".
post.category = titleParts.pop()[1:-1].strip()
if authorAnchorTag:
# Registered user: id is the trailing path segment of the profile link.
post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
post.authorName = authorAnchorTag.get_text(strip=True)
else:
# Guest author rendered as "name(ip-or-id)".
authorParts = authorTag.get_text(strip=True).split('(')
post.authorId = authorParts[1][:-1].strip()
post.authorName = authorParts[0].strip()
# Page timestamps are naive KST; attach the zone explicitly.
post.created_at = (
datetime
.strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
.replace(tzinfo=ZoneInfo('Asia/Seoul'))
)
post.body = document.select_one('.thum-txtin')
# lol (translated from Korean laughter "ㅋㅋㅋㅋz")
if post.authorName != 'ori':
return
print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
if attachment_path:
self.replace_attachment(post, attachment_path)
if document_path:
(document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
# Script entry point of the NEW aiohttp version: main() is the async function
# whose lines appear interleaved above in this diff view.
if __name__ == '__main__':
asyncio.run(main())