refactor structure for sane workflow :)
main.py (new file)
@@ -0,0 +1,36 @@
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from requests.adapters import Retry

from adapter import TimeoutHTTPAdapter
from scraper import Scraper


scraper = Scraper(concurrency=5)

retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))

document_path = Path('archives')
attachment_path = document_path / 'attachments'
attachment_path.mkdir(parents=True, exist_ok=True)

try:
    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
        with ThreadPoolExecutor() as executor:
            posts = scraper.search('aoegame', 'name', -offset, 'ori')
            print(f'searching offset {-offset}, found {len(posts)} post(s)')
            as_completed([
                executor.submit(
                    scraper.view,
                    post,
                    document_path=document_path,
                    attachment_path=attachment_path
                )
                for post in posts
            ])


except KeyboardInterrupt:
    print(':-)')
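The adapter module is not part of this commit, so the exact behaviour of TimeoutHTTPAdapter is an assumption here. The conventional shape, matching the timeout= and max_retries= constructor arguments used above, is an HTTPAdapter subclass that injects a default timeout into every request. A minimal sketch:

# Hypothetical sketch of adapter.py (not included in this commit).
from requests.adapters import HTTPAdapter


class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, timeout=None, **kwargs):
        # Default per-request timeout in seconds; everything else
        # (e.g. max_retries) passes through to HTTPAdapter.
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Apply the default only when the caller did not set a timeout.
        if kwargs.get('timeout') is None:
            kwargs['timeout'] = self.timeout
        return super().send(request, **kwargs)

One subtlety in the loop above: the as_completed([...]) iterator is built but never consumed. Each batch still finishes, because ThreadPoolExecutor's context manager blocks on exit until all submitted futures complete, but an exception raised inside scraper.view is stored on its future and silently dropped. Iterating the futures and calling result() would surface those errors:

# Drop-in alternative for the bare as_completed(...) expression above.
futures = [
    executor.submit(scraper.view, post,
                    document_path=document_path,
                    attachment_path=attachment_path)
    for post in posts
]
for future in as_completed(futures):
    future.result()  # re-raises any exception from scraper.view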
models/__init__.py (new file)
@@ -0,0 +1 @@
from .post import *
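Because models/post.py (below) defines no __all__, this star import re-exports every public name the module binds, including its own imports (Optional, dataclass, datetime, Tag), not just Post. If that matters, an explicit export list in models/post.py narrows it:

# Suggested addition to models/post.py (not in this commit):
# limits what `from .post import *` exposes.
__all__ = ['Post']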
models/post.py (new file)
@@ -0,0 +1,17 @@
from typing import Optional
from dataclasses import dataclass
from datetime import datetime

from bs4 import Tag


@dataclass
class Post:
    boardId: str
    postId: int
    authorId: Optional[str] = None
    authorName: Optional[str] = None
    category: Optional[str] = None
    title: Optional[str] = None
    body: Optional[Tag] = None
    created_at: Optional[datetime] = None
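The @dataclass decorator generates __init__, __repr__, and __eq__ from these annotations: boardId and postId are required, and every other field defaults to None and can be passed by keyword. A hypothetical construction (the values here are placeholders):

# Only the two required fields plus any keyword overrides are needed.
post = Post(boardId='aoegame', postId=22760681, authorName='ori')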
scraper.py (48 changed lines)
@@ -1,8 +1,7 @@
 import re
 import hashlib

-from typing import Optional, Tuple, List
-from dataclasses import dataclass
+from typing import Tuple, List
 from datetime import datetime
 from zoneinfo import ZoneInfo
 from pathlib import Path
@@ -15,22 +14,10 @@ from requests import Session
 from requests.adapters import Retry
 from requests.cookies import create_cookie

-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup

 from cgi import parse_header
-from adapter import TimeoutHTTPAdapter
+from models import Post


-@dataclass
-class Post:
-    boardId: str
-    postId: int
-    authorId: Optional[str] = None
-    authorName: Optional[str] = None
-    category: Optional[str] = None
-    title: Optional[str] = None
-    body: Optional[Tag] = None
-    created_at: Optional[datetime] = None
-
-
 class Scraper(Session):
@@ -156,32 +143,3 @@ class Scraper(Session):

         if document_path:
             (document_path / f'{post.boardId}_{post.postId}.html').write_text(str(post.body))
-
-
-scraper = Scraper(concurrency=5)
-
-retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[404])
-scraper.mount('https://', TimeoutHTTPAdapter(timeout=1, max_retries=retries))
-
-document_path = Path('archives')
-attachment_path = document_path / 'attachments'
-attachment_path.mkdir(parents=True, exist_ok=True)
-
-try:
-    for offset in range(22760681, 22760681 + (10000 * 100), 10000):
-        with ThreadPoolExecutor() as executor:
-            posts = scraper.search('aoegame', 'name', -offset, 'ori')
-            print(f'searching offset {-offset}, found {len(posts)} post(s)')
-            as_completed([
-                executor.submit(
-                    scraper.view,
-                    post,
-                    document_path=document_path,
-                    attachment_path=attachment_path
-                )
-                for post in scraper.search('aoegame', 'name', -offset, 'ori')
-            ])
-
-
-except KeyboardInterrupt:
-    print(':-)')
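Because Scraper subclasses requests.Session, the mount call in main.py is the standard Session API: Session.mount(prefix, adapter) routes every request whose URL starts with prefix through that adapter, with the longest matching prefix winning. With the Retry policy above, urllib3 retries up to five times, treats a 404 response as retryable (status_forcelist=[404], presumably because the origin 404s transiently), and sleeps roughly backoff_factor * 2**(n-1) seconds between attempts; exact first-retry behaviour varies across urllib3 versions. The class body itself is elided by this diff; a skeleton consistent with the calls made in main.py, purely as an assumption:

# Hypothetical skeleton only; the real search()/view() bodies are not
# shown in this diff.
class Scraper(Session):
    def __init__(self, concurrency: int = 5):
        super().__init__()  # initialise the underlying Session first
        self.concurrency = concurrency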