1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. # CODE FROM GREYMASK, WITH SOME MODIFICATIONS, POSTED WITH APOLOGIES AND
  4. # WITHOUT PERMISSION, because the original post is MIA and Tumblr is melting
  5. # down.
  6. #
# TO USE ON MAC: copy this whole code snippet (from the line that says "#!/usr/bin/env python"
# all the way down to "sys.exit(0 if tb.total_count else 1)") and use TextEdit to
# save it as "tumblr_backup.py". You will probably also need to download xmltramp:
  10. #
  11. # http://www.aaronsw.com/2002/xmltramp/xmltramp.py
  12. #
  13. # and save it in the same directory as tumblr_backup.py. Then, open Terminal
  14. # (/Applications/Terminal), and go to the directory where you saved the files (if you used
  15. # your Downloads folder, you can do this by typing:
  16. #
  17. # cd ~/Downloads
  18. #
  19. # in Terminal and then hitting return). Then type:
  20. #
  21. # /usr/bin/python2.7 tumblr_backup.py -x YOUR_TUMBLR_BLOGNAME_HERE
  22. #
  23. # e.g.:
  24. #
  25. # /usr/bin/python2.7 tumblr_backup.py -x staff
  26. #
  27. # to back up the @staff Tumblr. It will create the folder
  28. # "YOUR_TUMBLR_BLOGNAME_HERE" in your working directory, inside which will be
  29. # a local copy of your blog. The script will take quite some time to run, and--
  30. # especially if you've been on Tumblr for a long time or post frequently--
  31. # the saved file structure may be quite large. I have about 6,000 posts and
  32. # my backup is about 4GB.
  33. #
  34. # Recommendation: if you want to leave this running overnight, put a movie on in VLC,
  35. # put it on repeat, turn your volume and brightness down, and leave your computer
# powered on and plugged in somewhere out of the way. You don't want your computer to
# go to sleep in the middle of a run; the script will get confused.
  38. #
  39. # Good luck everyone!
  40. #
  41. # standard Python library imports
  42. from __future__ import with_statement
  43. import os
  44. import sys
  45. import urllib
  46. import urllib2
  47. from xml.sax.saxutils import escape
  48. from xml.sax import SAXException
  49. import codecs
  50. import imghdr
  51. from collections import defaultdict
  52. import time
  53. import locale
  54. from glob import glob
  55. import re
  56. # extra required packages
  57. import xmltramp
  58. join = os.path.join
  59. # add another JPEG recognizer
  60. # see http://www.garykessler.net/library/file_sigs.html
  61. def test_jpg(h, f):
  62. if h[:3] == '\xFF\xD8\xFF' and h[3] in "\xDB\xE0\xE1\xE2\xE3":
  63. return 'jpg'
  64. imghdr.tests.append(test_jpg)
  65. # variable directory names, will be set in TumblrBackup.backup()
  66. save_folder = ''
  67. image_folder = ''
  68. # constant names
  69. root_folder = os.getcwdu()
  70. post_dir = 'posts'
  71. xml_dir = 'xml'
  72. image_dir = 'images'
  73. archive_dir = 'archive'
  74. theme_dir = 'theme'
  75. backup_css = 'backup.css'
  76. custom_css = 'custom.css'
  77. avatar_base = 'avatar'
  78. blog_name = ''
  79. post_header = ''
  80. post_ext = '.html'
  81. have_custom_css = False
  82. # ensure the right date/time format
  83. try:
  84. locale.setlocale(locale.LC_TIME, '')
  85. except locale.Error:
  86. pass
  87. encoding = 'utf-8'
  88. time_encoding = locale.getlocale(locale.LC_TIME)[1] or encoding
  89. def log(account, s):
  90. if not options.quiet:
  91. if account:
  92. sys.stdout.write('%s: ' % account)
  93. sys.stdout.write(s[:-1] + ' ' * 20 + s[-1:])
  94. sys.stdout.flush()
  95. def mkdir(dir, recursive=False):
  96. if not os.path.exists(dir):
  97. if recursive:
  98. os.makedirs(dir)
  99. else:
  100. os.mkdir(dir)
  101. def path_to(*parts):
  102. return join(save_folder, *parts)
  103. def open_file(open_fn, parts):
  104. if len(parts) > 1:
  105. mkdir(path_to(*parts[:-1]))
  106. return open_fn(path_to(*parts))
  107. def open_text(*parts):
  108. return open_file(
  109. lambda f: codecs.open(f, 'w', encoding, 'xmlcharrefreplace'), parts
  110. )
  111. def open_image(*parts):
  112. return open_file(lambda f: open(f, 'wb'), parts)
  113. def strftime(format, t=None):
  114. if t is None:
  115. t = time.localtime()
  116. return time.strftime(format, t).decode(time_encoding)
  117. def get_api_url(account):
  118. """construct the tumblr API URL"""
  119. global blog_name
  120. blog_name = account
  121. if '.' not in account:
  122. blog_name += '.tumblr.com'
  123. base = 'http://' + blog_name + '/api/read'
  124. if options.private:
  125. password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
  126. password_manager.add_password(None, base, '', options.private)
  127. auth_manager = urllib2.HTTPBasicAuthHandler(password_manager)
  128. opener = urllib2.build_opener(auth_manager)
  129. urllib2.install_opener(opener)
  130. return base
  131. def xmlparse(url, data=None):
  132. for _ in range(10):
  133. try:
  134. resp = urllib2.urlopen(url, data)
  135. except (urllib2.URLError, urllib2.HTTPError) as e:
  136. sys.stderr.write('%s getting %s\n' % (e, url))
  137. continue
  138. if resp.info().gettype() == 'text/xml':
  139. break
  140. else:
  141. return None
  142. xml = resp.read()
  143. try:
  144. doc = xmltramp.parse(xml)
  145. except SAXException as e:
  146. sys.stderr.write('%s %r\n\n%r\n\n%s\n' % (resp.info().gettype(), resp.msg, e, xml))
  147. return None
  148. return doc if doc._name == 'tumblr' else None
  149. def save_image(image_url):
  150. """saves an image if not saved yet, returns the local file name"""
  151. def _url(fn):
  152. return u'../%s/%s' % (image_dir, fn)
  153. image_filename = image_url.split('/')[-1]
  154. glob_filter = '' if '.' in image_filename else '.*'
  155. # check if a file with this name already exists
  156. image_glob = glob(join(image_folder, image_filename + glob_filter))
  157. if image_glob:
  158. return _url(os.path.split(image_glob[0])[1])
  159. # download the image data
  160. try:
  161. image_response = urllib2.urlopen(image_url)
  162. except urllib2.HTTPError:
  163. # return the original URL
  164. return image_url
  165. image_data = image_response.read()
  166. image_response.close()
  167. # determine the file type if it's unknown
  168. if '.' not in image_filename:
  169. image_type = imghdr.what(None, image_data[:32])
  170. if image_type:
  171. image_filename += '.' + image_type.replace('jpeg', 'jpg')
  172. # save the image
  173. with open_image(image_dir, image_filename) as image_file:
  174. image_file.write(image_data)
  175. return _url(image_filename)
  176. def save_style():
  177. with open_text(backup_css) as css:
  178. css.write('''\
  179. body { width: 720px; margin: 0 auto; }
  180. img { max-width: 720px; }
  181. blockquote { margin-left: 0; border-left: 8px #999 solid; padding: 0 24px; }
  182. .archive h1, .subtitle, article { padding-bottom: 0.75em; border-bottom: 1px #ccc dotted; }
  183. .post a.llink { display: none; }
  184. .meta a { text-decoration: none; }
  185. .avatar { float: right; }
  186. ''')
  187. def header(heading, title='', body_class='', subtitle='', avatar=''):
  188. root_rel = '' if body_class == 'index' else '../'
  189. css_rel = root_rel + (custom_css if have_custom_css else backup_css)
  190. if body_class:
  191. body_class = ' class=' + body_class
  192. h = u'''<!DOCTYPE html>
  193. <meta charset=%s>
  194. <title>%s</title>
  195. <link rel=stylesheet href=%s>
  196. <body%s>
  197. ''' % (encoding, heading, css_rel, body_class)
  198. if avatar:
  199. h += '<img src=%s%s/%s alt=Avatar class=avatar>\n' % (root_rel, theme_dir, avatar)
  200. if title:
  201. h += u'<h1>%s</h1>\n' % title
  202. if subtitle:
  203. h += u'<p class=subtitle>%s</p>\n' % subtitle
  204. return h
  205. def get_avatar():
  206. try:
  207. resp = urllib2.urlopen('http://api.tumblr.com/v2/blog/%s/avatar' % blog_name)
  208. avatar_data = resp.read()
  209. except:
  210. return
  211. avatar_file = avatar_base + '.' + imghdr.what(None, avatar_data[:32])
  212. with open_image(theme_dir, avatar_file) as f:
  213. f.write(avatar_data)
  214. def get_style():
  215. """Get the blog's CSS by brute-forcing it from the home page.
  216. The v2 API has no method for getting the style directly.
  217. See https://groups.google.com/d/msg/tumblr-api/f-rRH6gOb6w/sAXZIeYx5AUJ"""
  218. try:
  219. resp = urllib2.urlopen('http://%s/' % blog_name)
  220. page_data = resp.read()
  221. except:
  222. return
  223. match = re.search(r'(?s)<style type=.text/css.>(.*?)</style>', page_data)
  224. if match:
  225. css = match.group(1).strip().decode(encoding, 'replace')
  226. if not css:
  227. return
  228. css = css.replace('\r', '').replace('\n ', '\n')
  229. with open_text(theme_dir, 'style.css') as f:
  230. f.write(css + '\n')
  231. class TumblrBackup:
  232. def __init__(self):
  233. self.total_count = 0
  234. def build_index(self):
  235. for f in glob(path_to(post_dir, '*.html')):
  236. post = LocalPost(f)
  237. self.index[post.tm.tm_year][post.tm.tm_mon].append(post)
  238. def save_index(self):
  239. f = glob(path_to(theme_dir, avatar_base + '.*'))
  240. avatar = os.path.split(f[0])[1] if f else None
  241. with open_text('index.html') as idx:
  242. idx.write(header(self.title, self.title, body_class='index',
  243. subtitle=self.subtitle, avatar=avatar
  244. ))
  245. for year in sorted(self.index.keys(), reverse=options.reverse_index):
  246. self.save_year(idx, year)
  247. idx.write('<p>Generated on %s.</p>\n' % strftime('%x %X'))
  248. def save_year(self, idx, year):
  249. idx.write('<h3>%s</h3>\n<ul>\n' % year)
  250. for month in sorted(self.index[year].keys(), reverse=options.reverse_index):
  251. tm = time.localtime(time.mktime([year, month, 3, 0, 0, 0, 0, 0, -1]))
  252. month_name = self.save_month(year, month, tm)
  253. idx.write(' <li><a href=%s/%s title="%d post(s)">%s</a></li>\n' % (
  254. archive_dir, month_name, len(self.index[year][month]),
  255. strftime('%B', tm)
  256. ))
  257. idx.write('</ul>\n\n')
  258. def save_month(self, year, month, tm):
  259. file_name = '%d-%02d.html' % (year, month)
  260. with open_text(archive_dir, file_name) as arch:
  261. arch.write('\n\n'.join([
  262. header(self.title, strftime('%B %Y', tm), body_class='archive'),
  263. '\n'.join(p.get_post() for p in sorted(
  264. self.index[year][month], key=lambda x: x.date, reverse=options.reverse_month
  265. )),
  266. '<p><a href=../ rel=contents>Index</a></p>\n'
  267. ]))
  268. return file_name
  269. def backup(self, account):
  270. """makes single files and an index for every post on a public Tumblr blog account"""
  271. base = get_api_url(account)
  272. # make sure there are folders to save in
  273. global save_folder, image_folder, post_ext, post_dir, have_custom_css
  274. if options.blosxom:
  275. save_folder = root_folder
  276. post_ext = '.txt'
  277. post_dir = os.curdir
  278. post_class = BlosxomPost
  279. else:
  280. save_folder = join(root_folder, account)
  281. image_folder = path_to(image_dir)
  282. post_class = TumblrPost
  283. have_custom_css = os.access(path_to(custom_css), os.R_OK)
  284. mkdir(save_folder, True)
  285. self.post_count = 0
  286. # prepare the period start and end timestamps
  287. if options.period:
  288. i = 0; tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]
  289. if len(options.period) >= 6:
  290. i = 1; tm[1] = int(options.period[4:6])
  291. if len(options.period) == 8:
  292. i = 2; tm[2] = int(options.period[6:8])
  293. p_start = time.mktime(tm)
  294. tm[i] += 1
  295. p_stop = time.mktime(tm)
  296. # get the highest post id already saved
  297. ident_max = None
  298. if options.incremental:
  299. try:
  300. ident_max = max(
  301. long(os.path.splitext(os.path.split(f)[1])[0])
  302. for f in glob(path_to(post_dir, '*' + post_ext))
  303. )
  304. log(account, "Backing up posts after %d\r" % ident_max)
  305. except ValueError: # max() arg is an empty sequence
  306. pass
  307. else:
  308. log(account, "Getting basic information\r")
  309. # start by calling the API with just a single post
  310. soup = xmlparse(base + '?num=1')
  311. if not soup:
  312. return
  313. # collect all the meta information
  314. tumblelog = soup.tumblelog
  315. try:
  316. self.title = escape(tumblelog('title'))
  317. except KeyError:
  318. self.title = account
  319. self.subtitle = unicode(tumblelog)
  320. # use the meta information to create a HTML header
  321. global post_header
  322. post_header = header(self.title, body_class='post')
  323. # find the total number of posts
  324. total_posts = options.count or int(soup.posts('total'))
  325. last_post = options.skip + total_posts
  326. def _backup(posts):
  327. for p in sorted(posts, key=lambda x: long(x('id')), reverse=True):
  328. post = post_class(p)
  329. if ident_max and long(post.ident) <= ident_max:
  330. return False
  331. if options.period:
  332. if post.date >= p_stop:
  333. continue
  334. if post.date < p_start:
  335. return False
  336. post.generate_content()
  337. if post.error:
  338. sys.stderr.write('%s%s\n' % (post.error, 50 * ' '))
  339. post.save_post()
  340. self.post_count += 1
  341. return True
  342. # Get the XML entries from the API, which we can only do for max 50 posts at once.
  343. # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
  344. MAX = 50
  345. for i in range(options.skip, last_post, MAX):
  346. # find the upper bound
  347. j = min(i + MAX, last_post)
  348. log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, total_posts))
  349. # silent exception escape hatch added by Gins 2018.12.03
  350. try:
  351. soup = xmlparse('%s?num=%d&start=%d' % (base, j - i, i))
  352. except:
  353. pass
  354. if soup is None:
  355. return
  356. if not _backup(soup.posts['post':]):
  357. break
  358. if not options.blosxom and self.post_count:
  359. get_avatar()
  360. get_style()
  361. if not have_custom_css:
  362. save_style()
  363. self.index = defaultdict(lambda: defaultdict(list))
  364. self.build_index()
  365. self.save_index()
  366. log(account, "%d posts backed up\n" % self.post_count)
  367. self.total_count += self.post_count
  368. class TumblrPost:
  369. def __init__(self, post):
  370. self.content = ''
  371. self.post = post
  372. self.xml_content = post.__repr__(1, 1)
  373. self.ident = post('id')
  374. self.url = post('url')
  375. self.typ = post('type')
  376. self.date = int(post('unix-timestamp'))
  377. self.tm = time.localtime(self.date)
  378. self.title = ''
  379. self.tags = []
  380. self.file_name = self.ident + post_ext
  381. self.error = None
  382. def generate_content(self):
  383. """generates the content for this post"""
  384. post = self.post
  385. content = []
  386. def append(s, fmt=u'%s'):
  387. # the %s conversion calls unicode() on the xmltramp element
  388. content.append(fmt % s)
  389. def get_try(elt):
  390. try:
  391. return unicode(post[elt])
  392. except KeyError:
  393. return ''
  394. def append_try(elt, fmt=u'%s'):
  395. elt = get_try(elt)
  396. if elt:
  397. append(elt, fmt)
  398. if self.typ == 'regular':
  399. self.title = get_try('regular-title')
  400. append_try('regular-body')
  401. elif self.typ == 'photo':
  402. url = escape(get_try('photo-link-url'))
  403. for p in post.photoset['photo':] if hasattr(post, 'photoset') else [post]:
  404. src = unicode(p['photo-url'])
  405. append(escape(self.get_image_url(src)), u'<img alt="" src="%s">')
  406. if url:
  407. content[-1] = '<a href="%s">%s</a>' % (url, content[-1])
  408. content[-1] = '<p>' + content[-1] + '</p>'
  409. if p._name == 'photo' and p('caption'):
  410. append(p('caption'), u'<p>%s</p>')
  411. append_try('photo-caption')
  412. elif self.typ == 'link':
  413. url = unicode(post['link-url'])
  414. self.title = u'<a href="%s">%s</a>' % (escape(url),
  415. post['link-text'] if 'link-text' in post else url
  416. )
  417. append_try('link-description')
  418. elif self.typ == 'quote':
  419. append(post['quote-text'], u'<blockquote><p>%s</p></blockquote>')
  420. append_try('quote-source', u'<p>%s</p>')
  421. elif self.typ == 'video':
  422. source = unicode(post['video-source']).strip()
  423. if source.startswith('<'):
  424. player = source
  425. source = ''
  426. else:
  427. player = unicode(post['video-player']).strip()
  428. player = player.replace('src="//', 'src="http://')
  429. append(player)
  430. append_try('video-caption')
  431. if '//' in source:
  432. append(escape(source), u'<p><a href="%s">Original</a></p>')
  433. elif self.typ == 'audio':
  434. append(post['audio-player'])
  435. append_try('audio-caption')
  436. elif self.typ == 'answer':
  437. self.title = post.question
  438. append(post.answer)
  439. elif self.typ == 'conversation':
  440. self.title = get_try('conversation-title')
  441. append(
  442. '<br>\n'.join(escape(unicode(l)) for l in post.conversation['line':]),
  443. u'<p>%s</p>'
  444. )
  445. else:
  446. self.error = u"Unknown post type '%s' in post #%s" % (self.typ, self.ident)
  447. append(escape(self.xml_content), u'<pre>%s</pre>')
  448. self.tags = [u'%s' % t for t in post['tag':]]
  449. self.content = '\n'.join(content)
  450. # fix wrongly nested HTML tags
  451. for p in ('<p>(<(%s)>)', '(</(%s)>)</p>'):
  452. self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content)
  453. def get_image_url(self, url):
  454. return save_image(url)
  455. def get_post(self):
  456. """returns this post in HTML"""
  457. post = post_header + '<article class=%s id=p-%s>\n' % (self.typ, self.ident)
  458. post += '<p class=meta><span class=date>%s</span>\n' % strftime('%x %X', self.tm)
  459. post += u'<a class=llink href=../%s/%s>¶</a>\n' % (post_dir, self.file_name)
  460. post += u'<a href=%s rel=canonical>●</a></p>\n' % self.url
  461. if self.title:
  462. post += '<h2>%s</h2>\n' % self.title
  463. post += self.content
  464. if self.tags:
  465. post += u'\n<p class=tags>%s</p>' % u' '.join(u'#' + t for t in self.tags)
  466. post += '\n</article>\n'
  467. return post
  468. def save_post(self):
  469. """saves this post locally"""
  470. with open_text(post_dir, self.file_name) as f:
  471. f.write(self.get_post())
  472. os.utime(path_to(post_dir, self.file_name),
  473. (self.date, self.date)
  474. )
  475. if options.xml:
  476. with open_text(xml_dir, self.ident + '.xml') as f:
  477. f.write(self.xml_content)
  478. class BlosxomPost(TumblrPost):
  479. def get_image_url(self, url):
  480. return url
  481. def get_post(self):
  482. """returns this post as a Blosxom post"""
  483. post = self.title + '\nmeta-id: _' + self.ident + '\nmeta-url: ' + self.url
  484. if self.tags:
  485. post += '\nmeta-tags: ' + ' '.join(t.replace(' ', '+') for t in self.tags)
  486. post += '\n\n' + self.content
  487. return post
  488. class LocalPost:
  489. def __init__(self, post_file):
  490. with codecs.open(post_file, 'r', encoding) as f:
  491. self.lines = f.readlines()
  492. # remove header and footer
  493. while self.lines and '<article ' not in self.lines[0]:
  494. del self.lines[0]
  495. while self.lines and '</article>' not in self.lines[-1]:
  496. del self.lines[-1]
  497. self.file_name = os.path.split(post_file)[1]
  498. self.ident = os.path.splitext(self.file_name)[0]
  499. self.date = os.stat(post_file).st_mtime
  500. self.tm = time.localtime(self.date)
  501. def get_post(self):
  502. return u''.join(self.lines)
  503. if __name__ == '__main__':
  504. import optparse
  505. parser = optparse.OptionParser("Usage: %prog [options] blog-name ...",
  506. description="Makes a local backup of Tumblr blogs."
  507. )
  508. parser.add_option('-q', '--quiet', action='store_true',
  509. help="suppress progress messages"
  510. )
  511. parser.add_option('-i', '--incremental', action='store_true',
  512. help="incremental backup mode"
  513. )
  514. parser.add_option('-x', '--xml', action='store_true',
  515. help="save the original XML source"
  516. )
  517. parser.add_option('-b', '--blosxom', action='store_true',
  518. help="save the posts in blosxom format"
  519. )
  520. parser.add_option('-r', '--reverse-month', action='store_false', default=True,
  521. help="reverse the post order in the monthly archives"
  522. )
  523. parser.add_option('-R', '--reverse-index', action='store_false', default=True,
  524. help="reverse the index file order"
  525. )
  526. parser.add_option('-a', '--auto', type='int', metavar="HOUR",
  527. help="do a full backup at HOUR hours, otherwise do an incremental backup"
  528. " (useful for cron jobs)"
  529. )
  530. parser.add_option('-n', '--count', type='int', help="save only COUNT posts")
  531. parser.add_option('-s', '--skip', type='int', default=0,
  532. help="skip the first SKIP posts"
  533. )
  534. parser.add_option('-p', '--period', help="limit the backup to PERIOD"
  535. " ('y', 'm', 'd' or YYYY[MM[DD]])"
  536. )
  537. parser.add_option('-P', '--private', help="password for a private tumblr",
  538. metavar='PASSWORD'
  539. )
  540. options, args = parser.parse_args()
  541. if options.auto is not None:
  542. if options.auto == time.localtime().tm_hour:
  543. options.incremental = False
  544. else:
  545. options.incremental = True
  546. if options.period:
  547. try:
  548. options.period = time.strftime(
  549. {'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period]
  550. )
  551. except KeyError:
  552. options.period = options.period.replace('-', '')
  553. if len(options.period) not in (4, 6, 8):
  554. parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
  555. if not args:
  556. args = ['staff']
  557. tb = TumblrBackup()
  558. for account in args:
  559. tb.backup(account)
  560. sys.exit(0 if tb.total_count else 1)

# Tumblr backup script, originally from Greymask—the original post is MIA as of 2018/12/03.
#
# Revision notes: fixed some smart em dashes that were inserted by my autocorrect rules; added a link to xmltramp; updated the header comment with better instructions.