migrate to aiohttp
scraper.py (149 changed lines)
@@ -1,145 +1,22 @@
-import re
-import hashlib
+import asyncio
-
-from typing import Tuple, List
-from datetime import datetime
-from zoneinfo import ZoneInfo
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-
-from threading import Semaphore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from requests import Session
-from requests.adapters import Retry
-from requests.cookies import create_cookie
-
-from bs4 import BeautifulSoup
-
-from cgi import parse_header
-from models import Post
+
+from utils.middlewares import SemaphoreMiddleware
+from utils.scraper import Scraper
-
-
-class Scraper(Session):
+
+
+async def main():
+    middlewares = (
+        SemaphoreMiddleware(5),
+    )
+
-    def __init__(self, concurrency = 5):
-        super().__init__()
-
-        self.semaphore = Semaphore(concurrency)
-
-        self.headers['User-Agent'] = '(Android)'
-        self.headers['Referer'] = 'https://m.dcinside.com/board/6974gay'
-
-        self.cookies.set_cookie(create_cookie(
-            name='list_count',
-            value='200',
-            domain='.dcinside.com'
-        ))
-
-    def download_attachment(self, url: str, save_dir: Path) -> Tuple[str, Path]:
-        with self.semaphore:
-            res = self.get(url, stream=True)
-            res.raise_for_status()
-
-            hash = hashlib.sha1()
-
-            # fuck this shit
-            _, parts = parse_header(res.headers.get('Content-Disposition'))
-            fname = parts.get('filename', '')
-            fext = fname.split('.').pop()
-
-            with NamedTemporaryFile('wb', dir=save_dir) as file:
-                for chunk in res.iter_content(chunk_size=8192):
-                    if chunk:
-                        file.write(chunk)
-                        hash.update(chunk)
-
-                return url, Path(file.name).rename(save_dir / f'{hash.hexdigest()}.{fext}')
-
-
-    def replace_attachment(self, post: Post, save_dir: Path):
-        src_to_tags = {
-            img.attrs['data-original'].strip(): img
-            for img in post.body.select('img[data-original]')
-        }
-
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.download_attachment, src, save_dir)
-                for src in src_to_tags.keys()
-            ]
-
-            for future in as_completed(futures):
-                src, path = future.result()
-                src_to_tags[src]['src'] = path
-
-    def search(self, boardId: str, type: str, offset: int, value: str) -> List[Post]:
-        with self.semaphore:
-            res = self.get(
-                f'https://m.dcinside.com/board/{boardId}',
-                params={
-                    's_type': type,
-                    's_pos': offset,
-                    'serval': value
-                })
-            res.raise_for_status()
-
-        document = BeautifulSoup(res.text, 'html.parser')
-        return [
-            Post(
-                boardId=boardId,
-                postId=int(re.findall(r'/\d+', tag.attrs.get('href'))[0][1:])
-            )
-            for tag in document.select('.gall-detail-lnktb a[href]:first-child')
-        ]
-
-    def view(self, post: Post, document_path: Path = None, attachment_path: Path = None) -> Post:
-        with self.semaphore:
-            res = self.get(f'https://m.dcinside.com/board/{post.boardId}/{post.postId}')
-            res.raise_for_status()
+    async with Scraper(middlewares=middlewares) as scraper:
+        posts = await scraper.list('roh', 'person')
-
-        document = BeautifulSoup(res.text, 'html.parser')
+
+        for future in asyncio.as_completed([scraper.view(p) for p in posts]):
+            await future
-
-        titleWrapTags = document.select('.gallview-tit-box .ginfo2 > li')
-
-        timeTag = titleWrapTags.pop()
-        authorTag = titleWrapTags.pop()
-        authorAnchorTag = authorTag.select_one('a')
-
-        titleParts = (
-            document
-            .select_one('.gallview-tit-box .tit')
-            .get_text(strip=True)
-            .split('\r\n')
-        )
-
-        post.title = titleParts.pop().strip()
-        post.category = titleParts.pop()[1:-1].strip()
-
-        if authorAnchorTag:
-            post.authorId = re.findall(r'\/\w+$', authorAnchorTag.attrs.get('href'))[0][1:]
-            post.authorName = authorAnchorTag.get_text(strip=True)
-        else:
-            authorParts = authorTag.get_text(strip=True).split('(')
-            post.authorId = authorParts[1][:-1].strip()
-            post.authorName = authorParts[0].strip()
-
-        post.created_at = (
-            datetime
-            .strptime(timeTag.get_text(strip=True), '%Y.%m.%d %H:%M')
-            .replace(tzinfo=ZoneInfo('Asia/Seoul'))
-        )
-
-        post.body = document.select_one('.thum-txtin')
-
-        # lolz
-        if post.authorName != 'ori':
-            return
-
-        print(f'yoinked {post.boardId}/{post.postId}, written by {post.authorName}: {post.title}')
-
-        if attachment_path:
-            self.replace_attachment(post, attachment_path)
-
-        if document_path:
-            (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
+
+if __name__ == '__main__':
+    asyncio.run(main())
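
The new entrypoint depends on utils/scraper.py and utils/middlewares.py, which this diff does not touch. Below is a minimal sketch of what those modules might look like on top of a plain aiohttp.ClientSession; apart from the imported names (Scraper, SemaphoreMiddleware) and the call sites above (middlewares=, list(), view()), every detail is an assumption, not the committed code.

import asyncio
import functools

import aiohttp

from models import Post  # same model the old code imported


class SemaphoreMiddleware:
    # Assumption: an asyncio.Semaphore in place of the old
    # threading.Semaphore(concurrency), capping in-flight requests.
    def __init__(self, limit: int):
        self._semaphore = asyncio.Semaphore(limit)

    async def __call__(self, handler):
        async with self._semaphore:
            return await handler()


class Scraper:
    # Assumption: an async context manager owning one aiohttp.ClientSession,
    # carrying over the headers and cookie the old __init__ set.
    def __init__(self, middlewares=()):
        self._middlewares = middlewares
        self._session = None

    async def __aenter__(self):
        self._session = aiohttp.ClientSession(
            headers={
                'User-Agent': '(Android)',
                'Referer': 'https://m.dcinside.com/board/6974gay',
            },
            cookies={'list_count': '200'},
        )
        return self

    async def __aexit__(self, *exc_info):
        await self._session.close()

    async def _fetch(self, url: str, **kwargs) -> str:
        # Every request is threaded through the middleware chain, so a
        # single SemaphoreMiddleware(5) bounds concurrency session-wide.
        async def handler():
            async with self._session.get(url, **kwargs) as res:
                res.raise_for_status()
                return await res.text()

        call = handler
        for middleware in reversed(self._middlewares):
            call = functools.partial(middleware, call)
        return await call()

    async def list(self, board_id: str, keyword: str) -> list[Post]:
        # Stub: would fetch and parse a board page into Posts, like the
        # removed search() did with BeautifulSoup.
        ...

    async def view(self, post: Post) -> Post:
        # Stub: would fetch and parse a single post, like the removed view().
        ...

Structurally this keeps the shape of the old class: SemaphoreMiddleware(5) takes over from threading.Semaphore(concurrency), and asyncio.as_completed in main() mirrors the old ThreadPoolExecutor/as_completed fan-out.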
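The removed download_attachment() is the part with the least direct aiohttp translation, since it relied on requests' stream=True and on cgi.parse_header. Here is a hedged sketch of how the streaming SHA-1 download could be ported, assuming it moved into the new Scraper; the method body is illustrative, not the committed code.

import hashlib
from pathlib import Path
from tempfile import NamedTemporaryFile


async def download_attachment(self, url: str, save_dir: Path):
    # Hypothetical method of the new Scraper; assumes self._session is
    # its aiohttp.ClientSession.
    async with self._session.get(url) as res:
        res.raise_for_status()

        # aiohttp parses Content-Disposition itself; None if the header
        # is absent.
        disposition = res.content_disposition
        fname = disposition.filename if disposition else ''
        fext = fname.split('.').pop()

        sha1 = hashlib.sha1()
        # delete=False so the rename below does not fight the temp-file
        # cleanup on close.
        with NamedTemporaryFile('wb', dir=save_dir, delete=False) as file:
            async for chunk in res.content.iter_chunked(8192):
                file.write(chunk)
                sha1.update(chunk)

        # Keep the old content-addressed naming scheme.
        return url, Path(file.name).rename(save_dir / f'{sha1.hexdigest()}.{fext}')

res.content.iter_chunked() stands in for requests' iter_content(), and res.content_disposition replaces cgi.parse_header, which matters because the cgi module was removed in Python 3.13.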