Tumblr Export Converter
I made a short Python script to convert the default Tumblr export .zip file into a more usable folder structure, with each post and its relevant media separated out into its
own folder (along with a .meta JSON-format metadata file).
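For example, converting a hypothetical myblog.zip ends up looking something like this (post IDs and media filenames made up for illustration):

myblog/
|- 671234567890123456/
|  |- 671234567890123456_0.jpg
|  |- post.html
|  \- .meta
\- 665554443332221110/
   |- post.html
   \- .meta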
It requires BeautifulSoup 4 (the version used is 4.10.0) along with lxml, which the script uses as its parser; it was made using Python 3.10.7.
Here's the code:
import os
import json
import shutil
import zipfile

from bs4 import BeautifulSoup

# Tumblr Export Converter by max74.25 (https://cyborgcatboy.neocities.org/projects/tumblr-conv.html#)
#
# Tumblr Export Converter © 2022 by max74.25 is licensed under CC BY-NC-SA 4.0
# You can find a copy of the license at https://creativecommons.org/licenses/by-nc-sa/4.0/


class TumblrExportConverter:
    current_directory = ""
    extracted_base_dir = ""
    finished_posts = []
    final_dir = ""

    def search(self):
        self.current_directory = os.path.dirname(os.path.realpath(__file__))
        print("In directory: " + str(self.current_directory) + "\n")

        # find the files in the current directory that are zip files and append them to a list
        to_process = []
        print("Found archives:")
        for file in os.listdir(self.current_directory):
            if os.path.isfile(os.path.join(self.current_directory, file)) and file.split('.')[-1] == 'zip':
                print(file)
                to_process.append(file)
        print("\n")
        self.process_zip_files(to_process)

    def process_zip_files(self, input_zip_files):
        for zip_file in input_zip_files:
            # ive used the default style css from the tumblr archive package
            index_html = '''<!DOCTYPE html><html><head><title>Tumblr Archive</title>
<style>
body { font: 18px Helvetica, Arial, sans-serif; width: 500px; margin: 50px auto; }
a { color: inherit; }
body img { max-width: 100%; }
blockquote { margin: 10px 0px 10px 10px; padding-left: 15px; border-left: 4px solid #dcdcdc; }
.caption { margin-top: 20px; font-size: 14px; }
#reblog_info { margin: 20px 0; padding: 10px 0; border-bottom: 1px solid #ccc; font: 11px 'Lucida Grande', Arial, sans-serif; color: #888; }
#footer { margin: 20px 0; padding: 10px 0; border-top: 1px solid #ccc; font: 11px 'Lucida Grande', Arial, sans-serif; color: #888; }
#footer #timestamp, #footer .tag { display: inline-block; margin-right: 10px; }
</style></head><body>'''

            self.extracted_base_dir = os.path.join(self.current_directory, zip_file.replace('.zip', '.extract'))
            print("Extracting " + str(zip_file) + " to " + str(self.extracted_base_dir) + "...")

            # unzip them
            with zipfile.ZipFile(os.path.join(self.current_directory, zip_file), 'r') as zip_ref:
                zip_ref.extractall(self.extracted_base_dir)

            # enter the directory and unzip the posts file
            with zipfile.ZipFile(os.path.join(self.extracted_base_dir, 'posts.zip'), 'r') as zip_ref:
                zip_ref.extractall(os.path.join(self.extracted_base_dir, 'posts'))

            # directory structure:
            # self.extracted_base_dir
            # |- media
            # |  \ ...
            # |- posts
            # |  \ ...
            # \- posts.zip

            # process_posts returns the page built so far (header plus every post)
            index_html = self.process_posts(zip_file, index_html)

            # create an archive index file full of all the posts (commented out bc a web browser
            # will not be able to load any blog larger than ~200 posts (it loads *all* the blog
            # media at once << eats ram better than chrome))
            index_html += '''</body></html>'''
            # with open(os.path.join(self.current_directory, str(zip_file.split('.')[0]) + '.html'), 'w') as index_file:
            #     index_file.write(index_html)

            print('Finished ' + str(zip_file) + ". " + str(len(self.finished_posts)) + " post(s) converted\n\n")

            # clean up, and rename the output folder after the blog
            shutil.rmtree(self.extracted_base_dir)
            final_dir_rename = os.path.join(self.current_directory, zip_file.split('.')[0])
            try:
                os.rename(self.final_dir, final_dir_rename)
            except OSError:
                # the destination already exists (e.g. from a previous run) -- replace it
                shutil.rmtree(final_dir_rename)
                os.rename(self.final_dir, final_dir_rename)

    def process_posts(self, zip_file, index_html):
        # scan all the posts & media files
        post_files = os.listdir(os.path.join(self.extracted_base_dir, 'posts/html'))
        media_files = os.listdir(os.path.join(self.extracted_base_dir, 'media'))

        # create a new finished directory (this will be renamed later)
        self.final_dir = os.path.join(self.current_directory, str(zip_file.split('.')[0]) + '-processed')

        self.finished_posts = []
        print('Converting posts...')

        for post_file in post_files:
            post_id = post_file.replace('.html', '')
            with open(os.path.join(self.extracted_base_dir, 'posts/html', post_file), 'r') as post_handle:
                post = post_handle.read()
            soup = BeautifulSoup(post, 'lxml')

            # the reblog header link doubles as the author name and the source post url
            author = soup.find('a', class_='tumblr_blog')
            author = author.text if author is not None else "None"

            tags = []
            for tag in soup.find_all('span', class_='tag'):
                tags.append({'tag_name': tag.text, 'tag_type': 'tumblr_tag'})

            post_url = soup.find('a', class_='tumblr_blog')
            post_url = post_url['href'] if post_url is not None else ""

            html_contents = soup.find('body')
            # this needs to be the outermost blockquote << test with reblogged posts

            # each post is its own directory << can use post ID
            post_dir = os.path.join(self.final_dir, post_id)
            os.makedirs(post_dir, exist_ok=True)

            # map the media to the posts
            relevant_media = []
            for media in media_files:
                if post_id in media:
                    relevant_media.append(media)
                    # media moved to the relevant post directory
                    os.rename(os.path.join(self.extracted_base_dir, 'media', media),
                              os.path.join(self.final_dir, post_id, media))

            html_contents_finished = self.map_media(post_id, html_contents, relevant_media)
            html_contents_finished = html_contents_finished.replace('<body>', '<div class="post">') \
                                                           .replace('</body>', '</div>')

            post_filename = os.path.join(post_dir, 'post.html')
            with open(post_filename, 'w') as post_handle:
                post_handle.write(html_contents_finished.replace('{{root}}', ''))

            timestamp_tag = soup.find('span', id='timestamp')
            timestamp = timestamp_tag.text if timestamp_tag is not None else ''
            post_name = soup.text.replace('\n', '').replace(timestamp, '')[:255]
            if post_name == '':
                post_name = 'untitled tumblr post'

            # save as .meta in post dir
            json_data = {
                'name': post_name,
                'obj_type': 'tumblr_post:' + post_id,
                'duration': len(soup.text.split()),
                'source_url': post_url,
                'source_id': post_id,
                'creators': author,
                'tags': tags,
                'files_path': str(zip_file.split('.')[0] + '/' + str(post_id))
            }
            meta_filename = os.path.join(post_dir, '.meta')
            with open(meta_filename, 'w') as meta_file:
                json.dump(json_data, meta_file)  # written as real json so other tools can parse it

            self.finished_posts.append(str(post_filename))
            index_html += "\n\n" + str(html_contents_finished).replace(
                '{{root}}', str(zip_file.split('.')[0]) + '/' + str(post_id) + '/')

        return index_html

    def map_media(self, post_id, html_contents, relevant_media):
        # edit the post contents to use the saved media
        # first find all the media tags in the content >> audio, video, images
        post_media = html_contents.find_all(['img', 'figure', 'embed', 'video', 'audio'])
        html_contents_finished = str(html_contents)

        numbers = []      # what order the media is in (currently unused)
        numbers_ext = []  # what types of files the media is, in order (currently unused)
        for m in relevant_media:
            if "_" not in m:
                break
            numbers.append(m.replace(post_id + "_", "").replace('.' + m.split('.')[-1], ''))
            numbers_ext.append(m.replace(post_id + "_", ""))

        i = 0
        completed_tags = []
        for post_media_tag in post_media:
            for media_tag in post_media_tag.find_all(src=True):
                if media_tag not in completed_tags and i < len(relevant_media):
                    if 'media.tumblr.com' in media_tag['src']:
                        # swap whatever source is there for the local copy; the {{root}} tag is
                        # there so the reference to the media can be rewritten *relative* to the
                        # final path later
                        html_contents_finished = html_contents_finished.replace(
                            str(media_tag['src']), "{{root}}" + str(relevant_media[i]))
                        completed_tags.append(media_tag)
                        i += 1
        return html_contents_finished


p = TumblrExportConverter()
p.search()
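To use it, drop the script into the same folder as your Tumblr export zip(s) and run it with Python 3; it scans its own directory for .zip files. And since each .meta file is written out as plain JSON, other tools can read the archive back in. Here's a minimal sketch of that, assuming the made-up blog folder and post ID from the example layout above:

import json
import os

# hypothetical paths -- swap in your own blog folder and post ID
post_dir = os.path.join('myblog', '671234567890123456')

with open(os.path.join(post_dir, '.meta')) as meta_file:
    meta = json.load(meta_file)

print(meta['name'])        # the post title (or the first 255 chars of its text)
print(meta['creators'])    # the author pulled from the reblog header link
print([tag['tag_name'] for tag in meta['tags']])  # the post's tags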
Tumblr Export Converter by max74.25 is licensed under CC BY-NC-SA 4.0