Skip to content

Commit

Permalink
tumblr_backup: Implement wget module
Browse files Browse the repository at this point in the history
Uses urllib3 and a temporary file to make downloading media smoother.
Also supports various modes of timestamping, inspired by wget.

Included revisions:
- Fix urllib3 status code handling
- Cleanup fsync logic
- Specific type: ignore comments
- Cleanup file_isfat and merge into util.py
- note_scraper: Smarter handling of cookies and error messages
- Remove enospc handler
- Use os.supports_dir_fd
- No more log_queue
- Global, abstract no_internet
- Support "canceled" WaitOnMainThread status

Fixes bbolli#201
  • Loading branch information
cebtenzzre committed Sep 29, 2020
1 parent e1d71c1 commit b2402e9
Show file tree
Hide file tree
Showing 4 changed files with 1,188 additions and 193 deletions.
86 changes: 51 additions & 35 deletions note_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@

from __future__ import absolute_import, division, print_function, with_statement

import contextlib
import itertools
import re
import ssl
import sys
import time
import traceback
import warnings

from bs4 import BeautifulSoup

from util import ConnectionFile, HAVE_SSL_CTX, HTTP_TIMEOUT, to_bytes, to_native_str
from util import ConnectionFile, is_dns_working, to_bytes, to_native_str

try:
from typing import TYPE_CHECKING
Expand All @@ -23,27 +22,41 @@
from typing import List, Text

try:
from http.client import HTTPException
from http.cookiejar import MozillaCookieJar
except ImportError:
from httplib import HTTPException # type: ignore[no-redef]
from cookielib import MozillaCookieJar # type: ignore[no-redef]

try:
from http.cookiejar import MozillaCookieJar
import requests
except ImportError:
from cookielib import MozillaCookieJar # type: ignore[no-redef]
try:
from pip._vendor import requests # type: ignore[no-redef]
except ImportError:
raise RuntimeError('The requests module is required for note scraping. '
'Please install it with pip or your package manager.')

try:
from urllib.error import HTTPError, URLError
from urllib.parse import quote, urlparse, urlsplit, urlunsplit
from urllib.request import BaseHandler, HTTPCookieProcessor, HTTPSHandler, build_opener
except ImportError:
from urllib import quote # type: ignore[attr-defined,no-redef]
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPSHandler, HTTPError, URLError, # type: ignore[no-redef]
build_opener)
from urlparse import urlparse, urlsplit, urlunsplit # type: ignore[no-redef]

try:
from urllib3 import Retry
from urllib3.exceptions import HTTPError, InsecureRequestWarning
except ImportError:
try:
# pip includes urllib3
from pip._vendor.urllib3 import Retry
from pip._vendor.urllib3.exceptions import HTTPError, InsecureRequestWarning
except ImportError:
raise RuntimeError('The urllib3 module is required. Please install it with pip or your package manager.')

EXIT_SUCCESS = 0
EXIT_SAFE_MODE = 2
EXIT_NO_INTERNET = 3

HTTP_RETRY = Retry(3, connect=False)

# Globals
post_url = None
Expand Down Expand Up @@ -73,10 +86,11 @@ def __init__(self, noverify, cookiefile, notes_limit):
self.notes_limit = notes_limit
self.lasturl = None

handlers = [] # type: List[BaseHandler] # pytype: disable=invalid-annotation
if HAVE_SSL_CTX:
context = ssl._create_unverified_context() if noverify else ssl.create_default_context()
handlers.append(HTTPSHandler(context=context))
self.session = requests.Session()
self.session.verify = not noverify
for adapter in self.session.adapters.values():
adapter.max_retries = HTTP_RETRY

if cookiefile is not None:
cookies = MozillaCookieJar(cookiefile)
cookies.load()
Expand All @@ -88,9 +102,7 @@ def __init__(self, noverify, cookiefile, notes_limit):
cookie.expires = None
cookie.discard = True

handlers.append(HTTPCookieProcessor(cookies))

self.opener = build_opener(*handlers)
self.session.cookies = cookies # type: ignore[assignment]

@classmethod
def quote_unsafe(cls, string):
Expand Down Expand Up @@ -140,19 +152,23 @@ def ratelimit_sleep(self, headers):
def urlopen(self, iri):
self.lasturl = iri
uri = self.iri_to_uri(iri)

try_count = 0
while True:
try:
with contextlib.closing(self.opener.open(uri, timeout=HTTP_TIMEOUT)) as resp:
try_count += 1
parsed_uri = urlparse(resp.geturl())
if re.match(r'(www\.)?tumblr\.com', parsed_uri.netloc) and parsed_uri.path == '/safe-mode':
sys.exit(EXIT_SAFE_MODE)
return resp.read().decode('utf-8', errors='ignore')
except HTTPError as e:
if e.code == 429 and try_count < self.TRY_LIMIT and self.ratelimit_sleep(e.headers):
with self.session.get(uri) as resp:
try_count += 1
parsed_uri = urlparse(resp.url)
if re.match(r'(www\.)?tumblr\.com', parsed_uri.netloc) and parsed_uri.path == '/safe-mode':
sys.exit(EXIT_SAFE_MODE)
if resp.status_code == 429 and try_count < self.TRY_LIMIT and self.ratelimit_sleep(resp.headers):
continue
raise
if 200 <= resp.status_code < 300:
return resp.content.decode('utf-8', errors='ignore')
log(iri, 'Unexpected response status: HTTP {} {}{}'.format(
resp.status_code, resp.reason,
'' if resp.status_code == 404 else '\nHeaders: {}'.format(resp.headers),
))
return None

@staticmethod
def get_more_link(soup, base, notes_url):
Expand Down Expand Up @@ -220,6 +236,10 @@ def main(stdout_conn, msg_conn, post_url_, ident_, noverify, notes_limit, cookie
global post_url, ident, msg_pipe
post_url, ident = post_url_, ident_

if noverify:
# Hide the InsecureRequestWarning from urllib3
warnings.filterwarnings('ignore', category=InsecureRequestWarning)

with ConnectionFile(msg_conn, 'w') as msg_pipe:
crawler = WebCrawler(noverify, cookiefile, notes_limit)

Expand All @@ -228,13 +248,9 @@ def main(stdout_conn, msg_conn, post_url_, ident_, noverify, notes_limit, cookie
except KeyboardInterrupt:
sys.exit() # Ignore these so they don't propogate into the parent
except HTTPError as e:
log(crawler.lasturl, 'HTTP Error {} {}'.format(e.code, e.reason))
sys.exit()
except URLError as e:
log(crawler.lasturl, 'URL Error: {}'.format(e.reason))
sys.exit()
except HTTPException as e:
log(crawler.lasturl, 'HTTP Exception: {}'.format(e))
if not is_dns_working(timeout=5):
sys.exit(EXIT_NO_INTERNET)
log(crawler.lasturl, e)
sys.exit()
except Exception:
log(crawler.lasturl, 'Caught an exception')
Expand Down
Loading

0 comments on commit b2402e9

Please sign in to comment.