Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tumblr] attempt to fetch high-quality inline images #2877

Merged
merged 3 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2266,10 +2266,11 @@ Type
Default
``true``
Description
Download full-resolution ``photo`` images.
Download full-resolution ``photo`` and ``inline`` images.

For each photo with "maximum" resolution
(width equal to 2048 or height equal to 3072),
(width equal to 2048 or height equal to 3072)
or each inline image,
use an extra HTTP request to find the URL to its full-resolution version.


Expand Down
34 changes: 24 additions & 10 deletions gallery_dl/extractor/tumblr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,6 @@
import re


def _original_inline_image(url):
return re.sub(
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
r"https://\1_1280.\2", url
)


def _original_video(url):
return re.sub(
(r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
Expand Down Expand Up @@ -141,7 +133,7 @@ def items(self):
# API response, but they can't contain images/videos anyway
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
for url in re.findall('<img src="([^"]+)"', body):
url = _original_inline_image(url)
url = self._original_inline_image(url)
posts.append(self._prepare_image(url, post.copy()))
for url in re.findall('<source src="([^"]+)"', body):
url = _original_video(url)
Expand Down Expand Up @@ -221,7 +213,21 @@ def _skip_reblog_same_blog(self, post):
return self.blog != post.get("reblogged_root_uuid")

def _original_image(self, url):
url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
return self._update_image_token(
url.replace("/s2048x3072/", "/s99999x99999/", 1))

def _original_inline_image(self, url):
if self.original:
url, n = re.subn(r"/s\d+x\d+/", "/s99999x99999/", url, 1)
if n:
return self._update_image_token(url)
return re.sub(
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
r"https://\1_1280.\2", url
)

def _update_image_token(self, url):
    """Request ``url`` and return the image URL found in the response.

    The request is sent with an ``Accept`` header that prefers
    ``text/html``; the returned value is the first item of what
    ``text.extract()`` yields for the text between ``" src="`` and the
    next ``"`` in the response body.
    """
    response = self.request(
        url, headers={"Accept": "text/html,*/*;q=0.8"})
    found = text.extract(response.text, '" src="', '"')
    return found[0]
Expand Down Expand Up @@ -305,6 +311,14 @@ class TumblrPostExtractor(TumblrExtractor):
("https://mikf123.tumblr.com/post/181022380064/chat-post", {
"count": 0,
}),
("https://kichatundk.tumblr.com/post/654953419288821760", {
"count": 2, # high-quality images (#1846)
"content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
}),
("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
"count": 2, # high-quality images (#1344)
"content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
}),
("https://mikf123.tumblr.com/image/689860196535762944", {
"pattern": r"^https://\d+\.media\.tumblr\.com"
r"/134791621559a79793563b636b5fe2c6"
Expand Down