Paste2.org - Viewing Paste 3cwm4MxJ

#!/usr/bin/env python
# encoding: utf-8
# CODE FROM GREYMASK, WITH SOME MODIFICATIONS, POSTED WITH APOLOGIES AND
# WITHOUT PERMISSION, because the original post is MIA and Tumblr is melting
# down.
#
# TO USE ON MAC: copy this whole code snippet (from the line that says "#!/usr/bin/env python"
# alllllllll the way down to "sys.exit(0 if tb.total_count else 1)" and use TextEdit to
# save it as "tumblr_backup.py". You will probably also need to download xmltramp:
#
# http://www.aaronsw.com/2002/xmltramp/xmltramp.py
#
# and save it in the same directory as tumblr_backup.py. Then, open Terminal
# (/Applications/Terminal), and go to the directory where you saved the files (if you used
# your Downloads folder, you can do this by typing:
#
# cd ~/Downloads
#
# in Terminal and then hitting return). Then type:
#
# /usr/bin/python2.7 tumblr_backup.py -x YOUR_TUMBLR_BLOGNAME_HERE
#
# e.g.:
#
# /usr/bin/python2.7 tumblr_backup.py -x staff
#
# to back up the @staff Tumblr. It will create the folder
# "YOUR_TUMBLR_BLOGNAME_HERE" in your working directory, inside which will be
# a local copy of your blog. The script will take quite some time to run, and--
# especially if you've been on Tumblr for a long time or post frequently--
# the saved file structure may be quite large. I have about 6,000 posts and
# my backup is about 4GB.
#
# Recommendation: if you want to leave this running overnight, put a movie on in VLC,
# put it on repeat, turn your volume and brightness down, and leave your computer
# powered on and plugged in somewhere out of the way. You don't want your computer to
# go to sleep in the middle, it'll get confused.
#
# Good luck everyone!
#
# standard Python library imports
from __future__ import with_statement
import os
import sys
import urllib
import urllib2
from xml.sax.saxutils import escape
from xml.sax import SAXException
import codecs
import imghdr
from collections import defaultdict
import time
import locale
from glob import glob
import re
# extra required packages
import xmltramp
join = os.path.join
# add another JPEG recognizer
# see http://www.garykessler.net/library/file_sigs.html
def test_jpg(h, f):
if h[:3] == '\xFF\xD8\xFF' and h[3] in "\xDB\xE0\xE1\xE2\xE3":
return 'jpg'
imghdr.tests.append(test_jpg)
# variable directory names, will be set in TumblrBackup.backup()
save_folder = ''
image_folder = ''
# constant names
root_folder = os.getcwdu()
post_dir = 'posts'
xml_dir = 'xml'
image_dir = 'images'
archive_dir = 'archive'
theme_dir = 'theme'
backup_css = 'backup.css'
custom_css = 'custom.css'
avatar_base = 'avatar'
blog_name = ''
post_header = ''
post_ext = '.html'
have_custom_css = False
# ensure the right date/time format
try:
locale.setlocale(locale.LC_TIME, '')
except locale.Error:
pass
encoding = 'utf-8'
time_encoding = locale.getlocale(locale.LC_TIME)[1] or encoding
def log(account, s):
if not options.quiet:
if account:
sys.stdout.write('%s: ' % account)
sys.stdout.write(s[:-1] + ' ' * 20 + s[-1:])
sys.stdout.flush()
def mkdir(dir, recursive=False):
if not os.path.exists(dir):
if recursive:
os.makedirs(dir)
else:
os.mkdir(dir)
def path_to(*parts):
return join(save_folder, *parts)
def open_file(open_fn, parts):
if len(parts) > 1:
mkdir(path_to(*parts[:-1]))
return open_fn(path_to(*parts))
def open_text(*parts):
return open_file(
lambda f: codecs.open(f, 'w', encoding, 'xmlcharrefreplace'), parts
)
def open_image(*parts):
return open_file(lambda f: open(f, 'wb'), parts)
def strftime(format, t=None):
if t is None:
t = time.localtime()
return time.strftime(format, t).decode(time_encoding)
def get_api_url(account):
"""construct the tumblr API URL"""
global blog_name
blog_name = account
if '.' not in account:
blog_name += '.tumblr.com'
base = 'http://' + blog_name + '/api/read'
if options.private:
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, base, '', options.private)
auth_manager = urllib2.HTTPBasicAuthHandler(password_manager)
opener = urllib2.build_opener(auth_manager)
urllib2.install_opener(opener)
return base
def xmlparse(url, data=None):
for _ in range(10):
try:
resp = urllib2.urlopen(url, data)
except (urllib2.URLError, urllib2.HTTPError) as e:
sys.stderr.write('%s getting %s\n' % (e, url))
continue
if resp.info().gettype() == 'text/xml':
break
else:
return None
xml = resp.read()
try:
doc = xmltramp.parse(xml)
except SAXException as e:
sys.stderr.write('%s %r\n\n%r\n\n%s\n' % (resp.info().gettype(), resp.msg, e, xml))
return None
return doc if doc._name == 'tumblr' else None
def save_image(image_url):
"""saves an image if not saved yet, returns the local file name"""
def _url(fn):
return u'../%s/%s' % (image_dir, fn)
image_filename = image_url.split('/')[-1]
glob_filter = '' if '.' in image_filename else '.*'
# check if a file with this name already exists
image_glob = glob(join(image_folder, image_filename + glob_filter))
if image_glob:
return _url(os.path.split(image_glob[0])[1])
# download the image data
try:
image_response = urllib2.urlopen(image_url)
except urllib2.HTTPError:
# return the original URL
return image_url
image_data = image_response.read()
image_response.close()
# determine the file type if it's unknown
if '.' not in image_filename:
image_type = imghdr.what(None, image_data[:32])
if image_type:
image_filename += '.' + image_type.replace('jpeg', 'jpg')
# save the image
with open_image(image_dir, image_filename) as image_file:
image_file.write(image_data)
return _url(image_filename)
def save_style():
with open_text(backup_css) as css:
css.write('''\
body { width: 720px; margin: 0 auto; }
img { max-width: 720px; }
blockquote { margin-left: 0; border-left: 8px #999 solid; padding: 0 24px; }
.archive h1, .subtitle, article { padding-bottom: 0.75em; border-bottom: 1px #ccc dotted; }
.post a.llink { display: none; }
.meta a { text-decoration: none; }
.avatar { float: right; }
''')
def header(heading, title='', body_class='', subtitle='', avatar=''):
root_rel = '' if body_class == 'index' else '../'
css_rel = root_rel + (custom_css if have_custom_css else backup_css)
if body_class:
body_class = ' class=' + body_class
h = u'''<!DOCTYPE html>
<meta charset=%s>
<title>%s</title>
<link rel=stylesheet href=%s>
<body%s>
''' % (encoding, heading, css_rel, body_class)
if avatar:
h += '<img src=%s%s/%s alt=Avatar class=avatar>\n' % (root_rel, theme_dir, avatar)
if title:
h += u'<h1>%s</h1>\n' % title
if subtitle:
h += u'<p class=subtitle>%s</p>\n' % subtitle
return h
def get_avatar():
try:
resp = urllib2.urlopen('http://api.tumblr.com/v2/blog/%s/avatar' % blog_name)
avatar_data = resp.read()
except:
return
avatar_file = avatar_base + '.' + imghdr.what(None, avatar_data[:32])
with open_image(theme_dir, avatar_file) as f:
f.write(avatar_data)
def get_style():
"""Get the blog's CSS by brute-forcing it from the home page.
The v2 API has no method for getting the style directly.
See https://groups.google.com/d/msg/tumblr-api/f-rRH6gOb6w/sAXZIeYx5AUJ"""
try:
resp = urllib2.urlopen('http://%s/' % blog_name)
page_data = resp.read()
except:
return
match = re.search(r'(?s)<style type=.text/css.>(.*?)</style>', page_data)
if match:
css = match.group(1).strip().decode(encoding, 'replace')
if not css:
return
css = css.replace('\r', '').replace('\n ', '\n')
with open_text(theme_dir, 'style.css') as f:
f.write(css + '\n')
class TumblrBackup:
def __init__(self):
self.total_count = 0
def build_index(self):
for f in glob(path_to(post_dir, '*.html')):
post = LocalPost(f)
self.index[post.tm.tm_year][post.tm.tm_mon].append(post)
def save_index(self):
f = glob(path_to(theme_dir, avatar_base + '.*'))
avatar = os.path.split(f[0])[1] if f else None
with open_text('index.html') as idx:
idx.write(header(self.title, self.title, body_class='index',
subtitle=self.subtitle, avatar=avatar
))
for year in sorted(self.index.keys(), reverse=options.reverse_index):
self.save_year(idx, year)
idx.write('<p>Generated on %s.</p>\n' % strftime('%x %X'))
def save_year(self, idx, year):
idx.write('<h3>%s</h3>\n<ul>\n' % year)
for month in sorted(self.index[year].keys(), reverse=options.reverse_index):
tm = time.localtime(time.mktime([year, month, 3, 0, 0, 0, 0, 0, -1]))
month_name = self.save_month(year, month, tm)
idx.write(' <li><a href=%s/%s title="%d post(s)">%s</a></li>\n' % (
archive_dir, month_name, len(self.index[year][month]),
strftime('%B', tm)
))
idx.write('</ul>\n\n')
def save_month(self, year, month, tm):
file_name = '%d-%02d.html' % (year, month)
with open_text(archive_dir, file_name) as arch:
arch.write('\n\n'.join([
header(self.title, strftime('%B %Y', tm), body_class='archive'),
'\n'.join(p.get_post() for p in sorted(
self.index[year][month], key=lambda x: x.date, reverse=options.reverse_month
)),
'<p><a href=../ rel=contents>Index</a></p>\n'
]))
return file_name
def backup(self, account):
"""makes single files and an index for every post on a public Tumblr blog account"""
base = get_api_url(account)
# make sure there are folders to save in
global save_folder, image_folder, post_ext, post_dir, have_custom_css
if options.blosxom:
save_folder = root_folder
post_ext = '.txt'
post_dir = os.curdir
post_class = BlosxomPost
else:
save_folder = join(root_folder, account)
image_folder = path_to(image_dir)
post_class = TumblrPost
have_custom_css = os.access(path_to(custom_css), os.R_OK)
mkdir(save_folder, True)
self.post_count = 0
# prepare the period start and end timestamps
if options.period:
i = 0; tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]
if len(options.period) >= 6:
i = 1; tm[1] = int(options.period[4:6])
if len(options.period) == 8:
i = 2; tm[2] = int(options.period[6:8])
p_start = time.mktime(tm)
tm[i] += 1
p_stop = time.mktime(tm)
# get the highest post id already saved
ident_max = None
if options.incremental:
try:
ident_max = max(
long(os.path.splitext(os.path.split(f)[1])[0])
for f in glob(path_to(post_dir, '*' + post_ext))
)
log(account, "Backing up posts after %d\r" % ident_max)
except ValueError: # max() arg is an empty sequence
pass
else:
log(account, "Getting basic information\r")
# start by calling the API with just a single post
soup = xmlparse(base + '?num=1')
if not soup:
return
# collect all the meta information
tumblelog = soup.tumblelog
try:
self.title = escape(tumblelog('title'))
except KeyError:
self.title = account
self.subtitle = unicode(tumblelog)
# use the meta information to create a HTML header
global post_header
post_header = header(self.title, body_class='post')
# find the total number of posts
total_posts = options.count or int(soup.posts('total'))
last_post = options.skip + total_posts
def _backup(posts):
for p in sorted(posts, key=lambda x: long(x('id')), reverse=True):
post = post_class(p)
if ident_max and long(post.ident) <= ident_max:
return False
if options.period:
if post.date >= p_stop:
continue
if post.date < p_start:
return False
post.generate_content()
if post.error:
sys.stderr.write('%s%s\n' % (post.error, 50 * ' '))
post.save_post()
self.post_count += 1
return True
# Get the XML entries from the API, which we can only do for max 50 posts at once.
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
MAX = 50
for i in range(options.skip, last_post, MAX):
# find the upper bound
j = min(i + MAX, last_post)
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, total_posts))
# silent exception escape hatch added by Gins 2018.12.03
try:
soup = xmlparse('%s?num=%d&start=%d' % (base, j - i, i))
except:
pass
if soup is None:
return
if not _backup(soup.posts['post':]):
break
if not options.blosxom and self.post_count:
get_avatar()
get_style()
if not have_custom_css:
save_style()
self.index = defaultdict(lambda: defaultdict(list))
self.build_index()
self.save_index()
log(account, "%d posts backed up\n" % self.post_count)
self.total_count += self.post_count
class TumblrPost:
def __init__(self, post):
self.content = ''
self.post = post
self.xml_content = post.__repr__(1, 1)
self.ident = post('id')
self.url = post('url')
self.typ = post('type')
self.date = int(post('unix-timestamp'))
self.tm = time.localtime(self.date)
self.title = ''
self.tags = []
self.file_name = self.ident + post_ext
self.error = None
def generate_content(self):
"""generates the content for this post"""
post = self.post
content = []
def append(s, fmt=u'%s'):
# the %s conversion calls unicode() on the xmltramp element
content.append(fmt % s)
def get_try(elt):
try:
return unicode(post[elt])
except KeyError:
return ''
def append_try(elt, fmt=u'%s'):
elt = get_try(elt)
if elt:
append(elt, fmt)
if self.typ == 'regular':
self.title = get_try('regular-title')
append_try('regular-body')
elif self.typ == 'photo':
url = escape(get_try('photo-link-url'))
for p in post.photoset['photo':] if hasattr(post, 'photoset') else [post]:
src = unicode(p['photo-url'])
append(escape(self.get_image_url(src)), u'<img alt="" src="%s">')
if url:
content[-1] = '<a href="%s">%s</a>' % (url, content[-1])
content[-1] = '<p>' + content[-1] + '</p>'
if p._name == 'photo' and p('caption'):
append(p('caption'), u'<p>%s</p>')
append_try('photo-caption')
elif self.typ == 'link':
url = unicode(post['link-url'])
self.title = u'<a href="%s">%s</a>' % (escape(url),
post['link-text'] if 'link-text' in post else url
)
append_try('link-description')
elif self.typ == 'quote':
append(post['quote-text'], u'<blockquote><p>%s</p></blockquote>')
append_try('quote-source', u'<p>%s</p>')
elif self.typ == 'video':
source = unicode(post['video-source']).strip()
if source.startswith('<'):
player = source
source = ''
else:
player = unicode(post['video-player']).strip()
player = player.replace('src="//', 'src="http://')
append(player)
append_try('video-caption')
if '//' in source:
append(escape(source), u'<p><a href="%s">Original</a></p>')
elif self.typ == 'audio':
append(post['audio-player'])
append_try('audio-caption')
elif self.typ == 'answer':
self.title = post.question
append(post.answer)
elif self.typ == 'conversation':
self.title = get_try('conversation-title')
append(
'<br>\n'.join(escape(unicode(l)) for l in post.conversation['line':]),
u'<p>%s</p>'
)
else:
self.error = u"Unknown post type '%s' in post #%s" % (self.typ, self.ident)
append(escape(self.xml_content), u'<pre>%s</pre>')
self.tags = [u'%s' % t for t in post['tag':]]
self.content = '\n'.join(content)
# fix wrongly nested HTML tags
for p in ('<p>(<(%s)>)', '(</(%s)>)</p>'):
self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content)
def get_image_url(self, url):
return save_image(url)
def get_post(self):
"""returns this post in HTML"""
post = post_header + '<article class=%s id=p-%s>\n' % (self.typ, self.ident)
post += '<p class=meta><span class=date>%s</span>\n' % strftime('%x %X', self.tm)
post += u'<a class=llink href=../%s/%s>¶</a>\n' % (post_dir, self.file_name)
post += u'<a href=%s rel=canonical>●</a></p>\n' % self.url
if self.title:
post += '<h2>%s</h2>\n' % self.title
post += self.content
if self.tags:
post += u'\n<p class=tags>%s</p>' % u' '.join(u'#' + t for t in self.tags)
post += '\n</article>\n'
return post
def save_post(self):
"""saves this post locally"""
with open_text(post_dir, self.file_name) as f:
f.write(self.get_post())
os.utime(path_to(post_dir, self.file_name),
(self.date, self.date)
)
if options.xml:
with open_text(xml_dir, self.ident + '.xml') as f:
f.write(self.xml_content)
class BlosxomPost(TumblrPost):
def get_image_url(self, url):
return url
def get_post(self):
"""returns this post as a Blosxom post"""
post = self.title + '\nmeta-id: _' + self.ident + '\nmeta-url: ' + self.url
if self.tags:
post += '\nmeta-tags: ' + ' '.join(t.replace(' ', '+') for t in self.tags)
post += '\n\n' + self.content
return post
class LocalPost:
def __init__(self, post_file):
with codecs.open(post_file, 'r', encoding) as f:
self.lines = f.readlines()
# remove header and footer
while self.lines and '<article ' not in self.lines[0]:
del self.lines[0]
while self.lines and '</article>' not in self.lines[-1]:
del self.lines[-1]
self.file_name = os.path.split(post_file)[1]
self.ident = os.path.splitext(self.file_name)[0]
self.date = os.stat(post_file).st_mtime
self.tm = time.localtime(self.date)
def get_post(self):
return u''.join(self.lines)
if __name__ == '__main__':
import optparse
parser = optparse.OptionParser("Usage: %prog [options] blog-name ...",
description="Makes a local backup of Tumblr blogs."
)
parser.add_option('-q', '--quiet', action='store_true',
help="suppress progress messages"
)
parser.add_option('-i', '--incremental', action='store_true',
help="incremental backup mode"
)
parser.add_option('-x', '--xml', action='store_true',
help="save the original XML source"
)
parser.add_option('-b', '--blosxom', action='store_true',
help="save the posts in blosxom format"
)
parser.add_option('-r', '--reverse-month', action='store_false', default=True,
help="reverse the post order in the monthly archives"
)
parser.add_option('-R', '--reverse-index', action='store_false', default=True,
help="reverse the index file order"
)
parser.add_option('-a', '--auto', type='int', metavar="HOUR",
help="do a full backup at HOUR hours, otherwise do an incremental backup"
" (useful for cron jobs)"
)
parser.add_option('-n', '--count', type='int', help="save only COUNT posts")
parser.add_option('-s', '--skip', type='int', default=0,
help="skip the first SKIP posts"
)
parser.add_option('-p', '--period', help="limit the backup to PERIOD"
" ('y', 'm', 'd' or YYYY[MM[DD]])"
)
parser.add_option('-P', '--private', help="password for a private tumblr",
metavar='PASSWORD'
)
options, args = parser.parse_args()
if options.auto is not None:
if options.auto == time.localtime().tm_hour:
options.incremental = False
else:
options.incremental = True
if options.period:
try:
options.period = time.strftime(
{'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period]
)
except KeyError:
options.period = options.period.replace('-', '')
if len(options.period) not in (4, 6, 8):
parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
if not args:
args = ['staff']
tb = TumblrBackup()
for account in args:
tb.backup(account)
sys.exit(0 if tb.total_count else 1)

Tumblr backup script, originally from Greymask—original post is MIA as of 2018/12/03.

Fixing some smart emdashes that got inserted by my autocorrect rules; added a link to xmltramp; updated comment with better instructions.