Tumblr Export Converter
I made a short Python script to convert the default Tumblr export .zip file into a more usable folder structure, with each post and its relevant media separated out into its
own folder (along with a .meta JSON-format metadata file).
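For example, converting a hypothetical myblog.zip ends up looking something like this (post IDs and media filenames made up for illustration):

myblog/
|- 671234567890123456/
|  |- 671234567890123456_0.jpg
|  |- post.html
|  \- .meta
\- 665554443332221110/
   |- post.html
   \- .meta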
It requires BeautifulSoup 4 (the version used is 4.10.0) along with lxml, which the script uses as its parser; it was made using Python 3.10.7.
Here's the code:
import os
import json
import shutil
import zipfile

from bs4 import BeautifulSoup

# Tumblr Export Converter by max74.25 (https://cyborgcatboy.neocities.org/projects/tumblr-conv.html#)
#
# Tumblr Export Converter © 2022 by max74.25 is licensed under CC BY-NC-SA 4.0
# You can find a copy of the license at https://creativecommons.org/licenses/by-nc-sa/4.0/


class TumblrExportConverter:
    current_directory = ""
    extracted_base_dir = ""
    finished_posts = []
    final_dir = ""

    def search(self):
        self.current_directory = os.path.dirname(os.path.realpath(__file__))
        print("In directory: " + str(self.current_directory) + "\n")

        # find the files in the current directory that are zip files and append them to a list
        to_process = []
        print("Found archives:")
        for file in os.listdir(self.current_directory):
            if os.path.isfile(os.path.join(self.current_directory, file)) and file.split('.')[-1] == 'zip':
                print(file)
                to_process.append(file)
        print("\n")
        self.process_zip_files(to_process)

    def process_zip_files(self, input_zip_files):
        for zip_file in input_zip_files:
            # ive used the default style css from the tumblr archive package
            index_html = '''<!DOCTYPE html><html><head><title>Tumblr Archive</title>
<style>
body { font: 18px Helvetica, Arial, sans-serif; width: 500px; margin: 50px auto; }
a { color: inherit; }
body img { max-width: 100%; }
blockquote { margin: 10px 0px 10px 10px; padding-left: 15px; border-left: 4px solid #dcdcdc; }
.caption { margin-top: 20px; font-size: 14px; }
#reblog_info { margin: 20px 0; padding: 10px 0; border-bottom: 1px solid #ccc; font: 11px 'Lucida Grande', Arial, sans-serif; color: #888; }
#footer { margin: 20px 0; padding: 10px 0; border-top: 1px solid #ccc; font: 11px 'Lucida Grande', Arial, sans-serif; color: #888; }
#footer #timestamp, #footer .tag { display: inline-block; margin-right: 10px; }
</style></head><body>'''

            self.extracted_base_dir = os.path.join(self.current_directory, zip_file.replace('.zip', '.extract'))
            print("Extracting " + str(zip_file) + " to " + str(self.extracted_base_dir) + "...")

            # unzip them
            with zipfile.ZipFile(os.path.join(self.current_directory, zip_file), 'r') as zip_ref:
                zip_ref.extractall(self.extracted_base_dir)

            # enter the directory and unzip the posts file
            with zipfile.ZipFile(os.path.join(self.extracted_base_dir, 'posts.zip'), 'r') as zip_ref:
                zip_ref.extractall(os.path.join(self.extracted_base_dir, 'posts'))

            # directory structure:
            # self.extracted_base_dir
            # |- media
            # |  \ ...
            # |- posts
            # |  \ ...
            # \- posts.zip

            # process_posts returns the page built so far (header plus every post)
            index_html = self.process_posts(zip_file, index_html)

            # create an archive index file full of all the posts (commented out bc a web browser
            # will not be able to load any blog larger than ~200 posts (it loads *all* the blog
            # media at once << eats ram better than chrome))
            index_html += '''</body></html>'''
            # with open(os.path.join(self.current_directory, str(zip_file.split('.')[0]) + '.html'), 'w') as index_file:
            #     index_file.write(index_html)

            print('Finished ' + str(zip_file) + ". " + str(len(self.finished_posts)) + " post(s) converted\n\n")

            # clean up, and rename the output folder after the blog
            shutil.rmtree(self.extracted_base_dir)
            final_dir_rename = os.path.join(self.current_directory, zip_file.split('.')[0])
            try:
                os.rename(self.final_dir, final_dir_rename)
            except OSError:
                # the destination already exists (e.g. from a previous run) -- replace it
                shutil.rmtree(final_dir_rename)
                os.rename(self.final_dir, final_dir_rename)

    def process_posts(self, zip_file, index_html):
        # scan all the posts & media files
        post_files = os.listdir(os.path.join(self.extracted_base_dir, 'posts/html'))
        media_files = os.listdir(os.path.join(self.extracted_base_dir, 'media'))

        # create a new finished directory (this will be renamed later)
        self.final_dir = os.path.join(self.current_directory, str(zip_file.split('.')[0]) + '-processed')

        self.finished_posts = []
        print('Converting posts...')

        for post_file in post_files:
            post_id = post_file.replace('.html', '')
            with open(os.path.join(self.extracted_base_dir, 'posts/html', post_file), 'r') as post_handle:
                post = post_handle.read()
            soup = BeautifulSoup(post, 'lxml')

            # the reblog header link doubles as the author name and the source post url
            author = soup.find('a', class_='tumblr_blog')
            author = author.text if author is not None else "None"

            tags = []
            for tag in soup.find_all('span', class_='tag'):
                tags.append({'tag_name': tag.text, 'tag_type': 'tumblr_tag'})

            post_url = soup.find('a', class_='tumblr_blog')
            post_url = post_url['href'] if post_url is not None else ""

            html_contents = soup.find('body')
            # this needs to be the outermost blockquote << test with reblogged posts

            # each post is its own directory << can use post ID
            post_dir = os.path.join(self.final_dir, post_id)
            os.makedirs(post_dir, exist_ok=True)

            # map the media to the posts
            relevant_media = []
            for media in media_files:
                if post_id in media:
                    relevant_media.append(media)
                    # media moved to the relevant post directory
                    os.rename(os.path.join(self.extracted_base_dir, 'media', media),
                              os.path.join(self.final_dir, post_id, media))

            html_contents_finished = self.map_media(post_id, html_contents, relevant_media)
            html_contents_finished = html_contents_finished.replace('<body>', '<div class="post">') \
                                                           .replace('</body>', '</div>')

            post_filename = os.path.join(post_dir, 'post.html')
            with open(post_filename, 'w') as post_handle:
                post_handle.write(html_contents_finished.replace('{{root}}', ''))

            timestamp_tag = soup.find('span', id='timestamp')
            timestamp = timestamp_tag.text if timestamp_tag is not None else ''
            post_name = soup.text.replace('\n', '').replace(timestamp, '')[:255]
            if post_name == '':
                post_name = 'untitled tumblr post'

            # save as .meta in post dir
            json_data = {
                'name': post_name,
                'obj_type': 'tumblr_post:' + post_id,
                'duration': len(soup.text.split()),
                'source_url': post_url,
                'source_id': post_id,
                'creators': author,
                'tags': tags,
                'files_path': str(zip_file.split('.')[0] + '/' + str(post_id))
            }
            meta_filename = os.path.join(post_dir, '.meta')
            with open(meta_filename, 'w') as meta_file:
                json.dump(json_data, meta_file)  # written as real json so other tools can parse it

            self.finished_posts.append(str(post_filename))
            index_html += "\n\n" + str(html_contents_finished).replace(
                '{{root}}', str(zip_file.split('.')[0]) + '/' + str(post_id) + '/')

        return index_html

    def map_media(self, post_id, html_contents, relevant_media):
        # edit the post contents to use the saved media
        # first find all the media tags in the content >> audio, video, images
        post_media = html_contents.find_all(['img', 'figure', 'embed', 'video', 'audio'])
        html_contents_finished = str(html_contents)

        numbers = []      # what order the media is in (currently unused)
        numbers_ext = []  # what types of files the media is, in order (currently unused)
        for m in relevant_media:
            if "_" not in m:
                break
            numbers.append(m.replace(post_id + "_", "").replace('.' + m.split('.')[-1], ''))
            numbers_ext.append(m.replace(post_id + "_", ""))

        i = 0
        completed_tags = []
        for post_media_tag in post_media:
            for media_tag in post_media_tag.find_all(src=True):
                if media_tag not in completed_tags and i < len(relevant_media):
                    if 'media.tumblr.com' in media_tag['src']:
                        # swap whatever source is there for the local copy; the {{root}} tag is
                        # there so the reference to the media can be rewritten *relative* to the
                        # final path later
                        html_contents_finished = html_contents_finished.replace(
                            str(media_tag['src']), "{{root}}" + str(relevant_media[i]))
                        completed_tags.append(media_tag)
                        i += 1
        return html_contents_finished


p = TumblrExportConverter()
p.search()
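To use it, drop the script into the same folder as your Tumblr export zip(s) and run it with Python 3; it scans its own directory for .zip files. And since each .meta file is written out as plain JSON, other tools can read the archive back in. Here's a minimal sketch of that, assuming the made-up blog folder and post ID from the example layout above:

import json
import os

# hypothetical paths -- swap in your own blog folder and post ID
post_dir = os.path.join('myblog', '671234567890123456')

with open(os.path.join(post_dir, '.meta')) as meta_file:
    meta = json.load(meta_file)

print(meta['name'])        # the post title (or the first 255 chars of its text)
print(meta['creators'])    # the author pulled from the reblog header link
print([tag['tag_name'] for tag in meta['tags']])  # the post's tags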
Tumblr Export Converter by max74.25 is licensed under CC BY-NC-SA 4.0