1
0

Compare commits

...

3 Commits

Author SHA1 Message Date
4efd92bcb0 fix: missing chunk, fr this time 2025-08-04 14:41:21 +09:00
753da82a48 fix: missing chunk from attachment files 2025-08-04 13:50:15 +09:00
f3344c57c3 fix: remove hardcoded integer 2025-08-04 13:31:48 +09:00

View File

@@ -18,18 +18,22 @@ from .typings import BoardPath, SearchType
from models import Attachment, Post
LIST_MAX_POSTS = 200
class Scraper(ClientSession):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# 모바일 페이지 및 첨부 파일 요청 시 필요한 기본 헤더 값
self.headers['Accept'] = '*/*'
self.headers['User-Agent'] = '(Android)'
self.headers['Referer'] = 'https://m.dcinside.com/board/aoegame'
# 게시글 목록 조회로 한 번에 불러올 항목 수
self.cookie_jar.update_cookies({
'list_count': '200'
'list_count': LIST_MAX_POSTS
})
async def __aenter__(self) -> 'Scraper':
@@ -77,14 +81,14 @@ class Scraper(ClientSession):
return [
Post(
id=int(re.findall(r'/\d+', tag.select_one('a[href]:first-child')['href'])[0][1:]),
id=int(re.findall(r'/\d+', tag.select_one('a[href]')['href'])[0][1:]),
board_id=board_id,
board_path=board_path
)
for tag in document.select('.gall-detail-lnktb')
]
async def view(self, post: Post):
async def view(self, post: Post) -> Post:
"""
게시글 내용을 조회합니다
@@ -137,8 +141,8 @@ class Scraper(ClientSession):
# 불필요한 본문 요소 제거
for tag in post.body.select('script, style'):
tag.extract()
print(f'{post.board_id}/{post.id}: {post.title}')
return post
async def fetch_voice(self, id: str):
"""
@@ -173,18 +177,12 @@ class Scraper(ClientSession):
return document.select_one('source')['src']
async def download_attachment(
self,
url: str,
save_dir: Path,
chunk_size = 8192
) -> Attachment:
async def download_attachment(self, url: str, save_dir: Path) -> Attachment:
"""
첨부 파일을 받아옵니다
:param url: 받아올 첨부 파일의 주소
:param save_dir: 받아질 로컬 디렉터리 경로
:param chunk_size: 청크 크기
"""
url_parsed = urlparse.urlparse(url)
url_params = urlparse.parse_qs(url_parsed.query)
@@ -205,9 +203,10 @@ class Scraper(ClientSession):
with NamedTemporaryFile('wb') as temp_file:
async with await self.get(attachment.source_url) as response:
async for chunk in response.content.iter_chunked(chunk_size):
async for chunk, _ in response.content.iter_chunks():
temp_file.write(chunk)
hash.update(chunk)
temp_file.flush()
# Content-Type 헤더로부터 확장자 알아내기
attachment.source_suffix = mimetypes.guess_extension(response.content_type)
@@ -241,7 +240,9 @@ class Scraper(ClientSession):
urls = [
# 이미지
*[
tag['data-original'].strip()
# 움짤은 자동 변환 후 `data-gif`와 `data-mp4`로 반환됨
# TODO: clean this up — prefers `data-mp4` when present and falls back to `data-original`
tag.attrs.get('data-mp4', tag['data-original']).strip()
for tag in post.body.select('img[data-original]')
],