Front loading a wordpress blog from instagram

My fiancée, like many others, has become jaded with the current social media landscape, and after a bit of nudging from a professional nerd has decided to set up her own little corner of the internet. Part of her site will be a blog - ostensibly a log of her experiments with new techniques and materials in her studio practice.

The problem, as with any new project, was the blank page problem. She asked if it was possible to take the years of content she’s built up on her instagram and migrate it to her new blog. The following is a description of the bag of hacks I put together to achieve this.

Stage 1, grab the content

First we needed to grab all the content from her existing Instagram profile. There are a bunch of tools that claim to do this, but the best I found was Instaloader, a command line tool that not only downloads the images from any public account, but also the captions, hashtags, comments etc.

pip3 install instaloader
# Targets are passed directly: instaloader has no "profile" subcommand, so a
# literal "profile" argument would be treated as another account to download.
# --no-compress-json writes plain .json metadata (the default is .json.xz),
# which is what the import script below looks for.
instaloader --no-compress-json YOURPROFILEHERE

Let this run for a few minutes (it works at a deliberately modest pace so as not to get nerfed by any Instagram rate limits), and you’ll soon have a directory with a bunch of jpegs and json files that describe each post.

Stage 2, get a wordpress access token

To use the API endpoints we’re going to hit in order to automate the creation of the posts, we are going to need an OAuth access token.

# WORDPRESS_SITE is the site's domain, e.g. "your-site.wordpress.com".
MEDIA_ENDPOINT = f"https://public-api.wordpress.com/rest/v1.1/sites/{WORDPRESS_SITE}/media/new"
POSTS_ENDPOINT = f"https://public-api.wordpress.com/rest/v1.1/sites/{WORDPRESS_SITE}/posts/new"

In order to get a token (bearing in mind that, as described above, this entire thing is a bag of hacks):

  1. Go to https://developer.wordpress.com/apps/
    Create an app and note:

    • client_id

    • client_secret

    • redirect_uri (you can use https://localhost for testing)

  2. Construct a browser URL like this (replace YOUR_CLIENT_ID and YOUR_REDIRECT_URI):

    https://public-api.wordpress.com/oauth2/authorize?client_id=YOUR_CLIENT_ID&redirect_uri=YOUR_REDIRECT_URI&response_type=code

    Open that URL in your browser, log in, approve the app, and it’ll redirect you to something like:

    https://localhost/?code=abcd1234

  3. Copy that code value and exchange it for a token:

    # Exchange the one-time authorization code for an access token.
    # Each backslash must be the very last character on its line; trailing
    # spaces after it break the line continuation.
    curl -X POST https://public-api.wordpress.com/oauth2/token \
        -d "client_id=YOUR_CLIENT_ID" \
        -d "client_secret=YOUR_CLIENT_SECRET" \
        -d "redirect_uri=YOUR_REDIRECT_URI" \
        -d "grant_type=authorization_code" \
        -d "code=abcd1234"
    

    You’ll get JSON with your access token.

That token can then be reused for your upload and post-creation operations.

Step 3, assemble the wordpress posts

I’m using python here, but obviously use whatever you like:

import os, json, requests, traceback
from datetime import datetime

# --- Settings to fill in before running --------------------------------
# OAuth token obtained from the token exchange described earlier.
ACCESS_TOKEN = "YOUR_OAUTH_TOKEN"
# Domain of the target WordPress.com site.
WORDPRESS_SITE = "your-site.wordpress.com"
# Directory holding the downloaded .json metadata files and .jpg images.
DATA_DIR = "./instagram_data"

# --- Values derived from the settings above ----------------------------
_SITE_API = "https://public-api.wordpress.com/rest/v1.1/sites/" + WORDPRESS_SITE
# Sent with every request to authenticate as the token's owner.
HEADERS = {"Authorization": "Bearer " + ACCESS_TOKEN}
MEDIA_ENDPOINT = _SITE_API + "/media/new"
POSTS_ENDPOINT = _SITE_API + "/posts/new"

def get_image_path(json_filename):
    """Find the image file(s) in DATA_DIR that belong to one metadata file.

    A post with a single image is stored as ``<base>.jpg``; otherwise the
    images are looked up as ``<base>_1.jpg``, ``<base>_2.jpg``, ... until the
    first missing number.

    Args:
        json_filename: Name (not path) of the post's ``.json`` file.

    Returns:
        A list of image paths, or ``None`` when no image was found.
    """
    suffix = ".json"
    # Strip only the trailing extension; str.replace() would also remove a
    # ".json" that happens to appear in the middle of the name.
    if json_filename.endswith(suffix):
        base = json_filename[: -len(suffix)]
    else:
        base = json_filename

    single = os.path.join(DATA_DIR, f"{base}.jpg")
    if os.path.exists(single):
        return [single]

    images = []
    while True:
        # The next expected index is always one past what has been collected.
        numbered = os.path.join(DATA_DIR, f"{base}_{len(images) + 1}.jpg")
        if not os.path.exists(numbered):
            break
        images.append(numbered)
    return images or None

def upload_image(media_path, timeout=120):
    """Upload one image file to the site's media library.

    Args:
        media_path: Path of the image file to upload.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would hang the whole import.

    Returns:
        The URL of the uploaded media item, or ``None`` if the upload failed
        for any reason (the reason is printed).
    """
    try:
        with open(media_path, "rb") as f:
            r = requests.post(
                MEDIA_ENDPOINT,
                headers=HEADERS,
                files={"media[]": f},
                timeout=timeout,
            )
        if r.status_code not in (200, 201):
            print(f"✗ Upload failed: {r.status_code} {r.text}")
            return None
        media = r.json().get("media") or []
        url = media[0].get("URL") if media else None
        if not url:
            # Covers both an empty media list and an entry without a URL, so a
            # missing URL is never reported as a successful upload.
            print("✗ No media URL in response")
            return None
        print(f"✓ Uploaded: {os.path.basename(media_path)} → {url}")
        return url
    except Exception as e:
        # Best-effort batch job: report the problem and let the caller move on.
        print(f"✗ Error uploading {media_path}: {e}")
        return None

def create_post(title, content, media_urls, post_date, timeout=30):
    """Publish one post made of the given images followed by the caption.

    Args:
        title: Post title.
        content: Caption text; it is HTML-escaped before being embedded.
        media_urls: URLs of already-uploaded images, in display order.
        post_date: Original publication time as a Unix timestamp.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` if the API answered 200/201, otherwise ``False`` (the reason
        is printed).
    """
    import html  # local import so the block does not depend on file-level imports

    # .astimezone() attaches the local UTC offset, so the ISO string names an
    # unambiguous instant instead of a bare wall-clock time.
    date_str = datetime.fromtimestamp(post_date).astimezone().isoformat()

    # Escape everything interpolated into markup: captions routinely contain
    # "&" or "<", and a stray quote in a URL would break the src attribute.
    images_html = "".join(
        f'<img src="{html.escape(str(u), quote=True)}" />' for u in media_urls
    )
    caption_text = "" if content is None else str(content)
    body = f"{images_html}<p>{html.escape(caption_text, quote=False)}</p>"

    payload = {"title": title, "content": body, "status": "publish", "date": date_str}
    try:
        r = requests.post(POSTS_ENDPOINT, headers=HEADERS, json=payload, timeout=timeout)
        if r.status_code in (200, 201):
            print(f"✓ Created post: {r.json().get('URL')}")
            return True
        print(f"✗ Failed to create post: {r.status_code} {r.text}")
        return False
    except Exception as e:
        # Best-effort batch job: report the problem and let the caller move on.
        print(f"✗ Error creating post: {e}")
        return False

def _extract_caption(node):
    """Return the caption text found in a post node, or a placeholder."""
    placeholder = "Posted from Instagram"
    caption = (node.get("iphone_struct") or {}).get("caption")
    if isinstance(caption, dict):
        # A dict whose "text" is missing or None must not end up as the
        # literal string "None" in the post body.
        caption = caption.get("text")
    if caption is None:
        # NOTE(review): fallback location assumed from instaloader's
        # web-style node layout - confirm against your own JSON files.
        edges = (node.get("edge_media_to_caption") or {}).get("edges") or []
        if edges:
            caption = (edges[0].get("node") or {}).get("text")
    return placeholder if caption is None else caption


def process_json(json_file):
    """Turn one downloaded metadata file into a published post.

    Reads the JSON, extracts caption and timestamp, uploads every matching
    image and finally creates the post.

    Args:
        json_file: Name (not path) of a ``.json`` file inside DATA_DIR.

    Returns:
        ``True`` if the post was created, ``False`` on any failure (the
        reason is printed).
    """
    path = os.path.join(DATA_DIR, json_file)
    print(f"\n📸 Processing {json_file}")
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        node = data.get("node", {})
        caption = _extract_caption(node)
        # NOTE(review): "taken_at_timestamp" is assumed to be the alternative
        # key for the same value - confirm against your own JSON files.
        post_date = node.get("date") or node.get("taken_at_timestamp")
        if not post_date:
            print("✗ No date found")
            return False
        title = f"Instagram Post - {datetime.fromtimestamp(post_date).strftime('%B %d, %Y')}"
        images = get_image_path(json_file)
        if not images:
            print("✗ No matching images found")
            return False
        urls = []
        for img in images:
            url = upload_image(img)
            if not url:
                # Give up on the whole post rather than publish it with
                # images missing.
                return False
            urls.append(url)
        return create_post(title, caption, urls, post_date)
    except Exception as e:
        # Per-file boundary of a batch job: log with traceback and carry on
        # with the next file.
        print(f"✗ Error: {e}")
        print(traceback.format_exc())
        return False

def main():
    """Process every .json file in DATA_DIR in name order and print a tally."""
    json_names = [name for name in os.listdir(DATA_DIR) if name.endswith(".json")]
    json_names.sort()
    print(f"Found {len(json_names)} Instagram posts")

    # One truthiness flag per file, collected in processing order.
    outcomes = [bool(process_json(name)) for name in json_names]
    succeeded = sum(outcomes)
    failed = len(outcomes) - succeeded
    print(f"\nDone — Success: {succeeded}, Failures: {failed}")


if __name__ == "__main__":
    main()


This works great, and over the course of about 5 minutes uploaded all the images to wordpress, and created blog posts with the correct timestamps, giving this brand new blog a catalog of posts going back over a number of years.

There was however, one problem.

Step 4, dealing with Gutenberg Blocks

The script above creates a bunch of posts, but I forgot that wordpress uses Gutenberg blocks for layout these days. Because the script just creates posts with a bunch of raw <img> and <p> tags, the images looked awful, having been basically ignored by the wordpress theme styling. Rather than delete all the posts and update the original script, the script below crawls across an existing wordpress site and converts any <img> and <p> tags it finds to the block format. BeautifulSoup might be overkill for this task, but it works and it works well.

Note, the get_posts endpoint doesn’t like to return more than 100 posts at a time, so we process them in batches.

import requests, time, traceback, re, os
from bs4 import BeautifulSoup

# --- Settings to fill in before running --------------------------------
# OAuth token obtained from the token exchange described earlier.
ACCESS_TOKEN = "YOUR_OAUTH_TOKEN"
# Domain of the WordPress.com site whose posts are converted.
WORDPRESS_SITE = "your-site.wordpress.com"

# --- Values derived from the settings above ----------------------------
# Sent with every request to authenticate as the token's owner.
HEADERS = {"Authorization": "Bearer " + ACCESS_TOKEN}
# Base URL for the posts collection; a post ID is appended for updates.
POSTS_ENDPOINT = (
    "https://public-api.wordpress.com/rest/v1.1/sites/" + WORDPRESS_SITE + "/posts"
)

def get_all_posts(timeout=30):
    """Fetch every published post of the site, 100 per request.

    Args:
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A list of post dicts as returned by the API. If a page cannot be
        fetched the failure is printed and the posts collected so far are
        returned.
    """
    posts, page = [], 1
    while True:
        # Let requests build the query string instead of formatting it by hand.
        params = {"number": 100, "status": "publish", "page": page}
        r = requests.get(POSTS_ENDPOINT, headers=HEADERS, params=params, timeout=timeout)
        if r.status_code != 200:
            # Report it: silently stopping here would make a truncated list
            # look like a complete one.
            print(f" Failed to fetch page {page}: {r.status_code} {r.text[:200]}")
            break
        batch = r.json().get("posts", [])
        if not batch:
            break
        posts.extend(batch)
        page += 1
        time.sleep(0.2)  # brief pause between pages
    print(f" Total posts fetched: {len(posts)}")
    return posts

def convert_to_blocks(html):
    """Rebuild a post body as Gutenberg image and paragraph blocks.

    Every ``<img>`` with a ``src`` becomes an image block and every non-empty
    ``<p>`` becomes a paragraph block; all images are emitted first, then all
    paragraphs. Any other markup in the input is not carried over.

    Args:
        html: The post's current HTML content.

    Returns:
        ``(new_content, True)`` when at least one block was produced,
        otherwise ``(html, False)`` with the input unchanged.
    """
    # Import the function only: the parameter is itself called "html".
    from html import escape

    if not html:
        return html, False

    soup = BeautifulSoup(html, "html.parser")
    blocks = []
    for img in soup.find_all("img"):
        url = img.get("src")
        if not url:
            continue
        # The parser hands back decoded attribute values, so they must be
        # re-escaped before being written into markup again.
        src = escape(url, quote=True)
        alt = escape(img.get("alt") or "", quote=True)
        blocks.append(
            f"<!-- wp:image -->\n<figure class='wp-block-image'><img src='{src}' alt='{alt}'/></figure>\n<!-- /wp:image -->"
        )
    for p in soup.find_all("p"):
        # Turn <br> into newlines first; get_text(strip=True) would instead
        # glue neighbouring fragments together with nothing in between.
        for br in p.find_all("br"):
            br.replace_with("\n")
        lines = [line.strip() for line in p.get_text().splitlines()]
        text = "<br>".join(escape(line, quote=False) for line in lines if line)
        if text:
            blocks.append(
                f"<!-- wp:paragraph -->\n<p>{text}</p>\n<!-- /wp:paragraph -->"
            )
    if not blocks:
        return html, False
    return "\n\n".join(blocks), True

def update_post(post_id, title, content, timeout=30):
    """Replace the content of an existing post.

    Args:
        post_id: ID of the post to update.
        title: Post title, used only for the progress message.
        content: New post body.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` if the API answered 200/201, otherwise ``False``.
    """
    url = f"{POSTS_ENDPOINT}/{post_id}"
    try:
        r = requests.post(url, headers=HEADERS, json={"content": content}, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable request should count as a failure, not abort the run.
        print(f" Failed to update {post_id}: {e}")
        return False
    if r.status_code in (200, 201):
        label = (title or "")[:30]  # title may be missing
        print(f" Updated post {post_id}: {label}...")
        return True
    print(f" Failed to update {post_id}: {r.status_code}")
    return False

def main():
    """Convert every fetched post to block markup and print a tally."""
    print("Converting posts to Gutenberg blocks...")
    posts = get_all_posts()
    success = fail = skip = 0
    total = len(posts)
    for i, p in enumerate(posts, 1):
        pid = p.get("ID")
        # Default to empty strings so a post without title or content is
        # skipped instead of crashing the slice or the parser below.
        title = p.get("title") or ""
        html = p.get("content") or ""
        print(f"\nProcessing {i}/{total}: {title[:40]}...")
        new_html, changed = convert_to_blocks(html)
        if not changed:
            print("Skipped (no paragraphs/images)")
            skip += 1
            continue
        if update_post(pid, title, new_html):
            success += 1
        else:
            fail += 1
        time.sleep(0.5)  # brief pause between updates
    print(f"\n Done. Success: {success}, Skipped: {skip}, Failed: {fail}")


if __name__ == "__main__":
    main()

Step 5, bask in glory and don’t forget to be your person’s first RSS subscriber

With her new blog up and running, I’m super excited to see what she gets up to over there. I added her RSS feed to netnewswire, and I can’t wait for the first new post to show up!