migrate-middleman-to-hugo/convert2.py

#!/bin/env python3
import re
import yaml
import shutil
import click
from datetime import datetime
from pathlib import Path

def migrate_header_img_and_return_new_path_for_post(img_path, img_new_path):
    """Copy a post's header image into the migrated post and return its site path.

    Args:
        img_path: The ``header_image`` value from the post's front matter. A
            value starting with ``/`` is resolved against the parent of the
            input directory; any other value is resolved against the input
            directory joined with the last two components of *img_new_path*.
        img_new_path: Output directory of the migrated post. The image is
            copied into it under its original file name.

    Returns:
        The path of the copied image with its leading path component replaced
        by ``/blog``, or ``None`` when *img_path* is an ``http(s)://`` URL
        (nothing is copied in that case).

    Note:
        Reads the module-level ``g_input`` that ``click_cli`` assigns.
    """
    # NOTE(review): remote images yield None, so the caller stores an empty
    # header image -- confirm that dropping the URL is intended.
    if re.match(r'https?://', img_path):
        return None

    image_name = Path(img_path).name
    post_dir = Path(img_new_path)
    if img_path.startswith('/'):
        # Site-absolute reference: anchored at the directory above the input.
        old = str(Path(g_input).parent) + img_path
    else:
        # Relative reference: joined with pathlib so the result does not
        # depend on ``g_input`` ending with a slash.
        old = str(Path(g_input, post_dir.parent.name, post_dir.name, img_path))

    new = img_new_path + '/' + image_name
    shutil.copyfile(old, new)

    # Swap only the leading component. A plain ``str.replace`` would also
    # rewrite later occurrences of the same text (e.g. "blog" inside a post
    # slug) and, for an empty first component, would inject '/blog' between
    # every character.
    _, separator, remainder = new.partition('/')
    return '/blog' + separator + remainder


def convert_m2h(input_file: str, output_dir: str):
    """Rewrite one Middleman post as ``<output_dir>/index.md`` with Hugo front matter.

    The text between the first two ``---`` markers is parsed as YAML and
    transformed:

    * ``date`` becomes ``publishDate`` and ``lastmod``;
    * ``tags`` given as a string are split on whitespace (commas stripped),
      empty ``tags`` are dropped;
    * a ``ws`` mapping marks the post as ``categories: event`` and supplies
      ``location`` (``where``) and ``date`` (``when``); other posts get
      ``categories: article``;
    * ``header_image`` is copied next to the post and stored under ``header``;
    * ``author`` becomes the ``authors`` list;
    * ``meta``, ``published``, ``status``, ``layout`` and ``type`` are removed;
    * ``draft`` is set to ``False``.

    In the body, every ``{...}`` span that fits on one line is removed.

    Args:
        input_file: Path of the source post.
        output_dir: Existing directory that receives ``index.md``.

    Raises:
        ValueError: If the file has no ``---`` delimited front matter or its
            ``date`` value is neither a string nor a date/datetime.
        KeyError: If the front matter has no ``date`` key.
    """
    # Function-scope import: the module itself only imports ``datetime``.
    from datetime import date

    datetime_parse = re.compile(r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}")

    # Read the post; the encoding is explicit so the result does not depend on
    # the platform default.
    with open(input_file, 'r', encoding='utf-8') as file:
        contents = file.read()

    # One match yields both parts, so the output can be assembled directly
    # instead of via ``str.replace`` (which could hit duplicate text).
    document = re.search(r'---(.*?)---(.*)', contents, re.DOTALL)
    if document is None:
        raise ValueError(f"no '---' delimited front matter found in {input_file}")
    front_matter, body = document.groups()
    preamble = contents[:document.start()]

    data = yaml.safe_load(front_matter)

    # Dates: YAML may deliver a string, a datetime or a plain date.
    raw_date = data.pop('date')
    if isinstance(raw_date, str):
        data['publishDate'] = date_checker(raw_date)
    elif isinstance(raw_date, date):  # ``datetime`` is a subclass of ``date``
        data['publishDate'] = raw_date.isoformat()
    else:
        raise ValueError(f"unsupported 'date' value {raw_date!r} in {input_file}")
    data['lastmod'] = data['publishDate']

    # Tags: drop empty values, turn "a, b c" into ['a', 'b', 'c'].
    if 'tags' in data:
        tags = data['tags']
        if tags is None or tags == []:
            del data['tags']
        elif not isinstance(tags, list):
            stripped = (tag.strip(',') for tag in tags.split())
            data['tags'] = [tag for tag in stripped if tag]

    # Events carry a ``ws`` mapping; an empty ``ws:`` is treated as no details.
    if 'ws' in data:
        event = data.pop('ws') or {}
        data['categories'] = 'event'
        if 'where' in event:
            data['location'] = event['where']
        if 'when' in event:
            when = event['when']
            if datetime_parse.match(str(when)):
                data['date'] = date_checker(when)
    else:
        data['categories'] = 'article'

    if 'header_image' in data:
        # NOTE(review): the helper returns None for http(s) images, which
        # leaves ``header.image`` empty -- confirm this is intended.
        new_img_path = migrate_header_img_and_return_new_path_for_post(
            data.pop('header_image'), output_dir)
        data['header'] = {
            'image': new_img_path,
            'caption': 'Sorry this blog entry was migrated, there is no Alt-Text',
        }

    if 'author' in data:
        author = data.pop('author')
        # Accept the mapping form as well as a plain "A, B" string.
        display_name = author['display_name'] if isinstance(author, dict) else author
        data['authors'] = display_name.split(', ')

    # Fields that are removed from the migrated front matter.
    for obsolete in ('meta', 'published', 'status', 'layout', 'type'):
        data.pop(obsolete, None)

    data['draft'] = False

    # Remove brace spans one at a time. A greedy, DOTALL pattern would delete
    # everything between the first '{' and the last '}' of the post.
    new_body = re.sub(r"\{.*?\}", "", body)

    # Create the new YAML frontmatter
    new_front_matter = '\n' + yaml.safe_dump(data)
    new_contents = preamble + '---' + new_front_matter + '---' + new_body

    # Write the new contents to the file
    with open(output_dir + "/index.md", 'w', encoding='utf-8') as file:
        file.write(new_contents)


def date_checker(new_date: str):
    """Convert a ``YYYY-MM-DD HH:MM[:SS] [zone]`` string to an ISO 8601 string.

    Args:
        new_date: The timestamp; it is passed through ``str()`` first. The
            optional zone may be a numeric offset (``+0200``, ``+02:00``),
            ``UTC``/``GMT``, or a zone name accepted by ``strptime``'s ``%Z``.

    Returns:
        The timestamp converted to the local time zone, formatted with
        ``isoformat()``. A timestamp without zone is interpreted as local
        time. ``None`` is returned when the text does not start with a
        ``YYYY-MM-DD HH:MM`` timestamp.

    Raises:
        ValueError: If the timestamp holds impossible values, is followed by
            text that is not separated by whitespace, or names an unknown zone.
    """
    # Function-scope import: the module itself only imports ``datetime``.
    from datetime import timezone

    text = str(new_date).strip()
    stamp = re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}(?::\d{2})?", text)
    if stamp is None:
        return None

    stamp_text = stamp.group(0)
    has_seconds = stamp_text.count(':') == 2
    base_format = '%Y-%m-%d %H:%M:%S' if has_seconds else '%Y-%m-%d %H:%M'
    remainder = text[stamp.end():]
    zone = remainder.strip()

    if not remainder:
        parsed = datetime.strptime(stamp_text, base_format)
    elif not remainder[0].isspace():
        # Text glued to the timestamp: let strptime report it as ValueError.
        parsed = datetime.strptime(text, base_format)
    elif zone.upper() in ('UTC', 'GMT'):
        # ``%Z`` accepts these names but returns a naive datetime, which
        # ``astimezone()`` would then treat as local time.
        parsed = datetime.strptime(stamp_text, base_format).replace(tzinfo=timezone.utc)
    else:
        zoned_text = f"{stamp_text} {zone}"
        try:
            # Numeric offsets are only understood by ``%z``.
            parsed = datetime.strptime(zoned_text, base_format + ' %z')
        except ValueError:
            # Fall back to zone names; raises ValueError for unknown ones.
            parsed = datetime.strptime(zoned_text, base_format + ' %Z')

    return parsed.astimezone().isoformat()


def convert_year(year_path, output):
    """Convert every post file found below one year directory.

    Files whose name matches ``*.m*`` are searched recursively. A post whose
    parent directory is named ``YYYY-MM-DD-...`` is written to a directory of
    that name below *output*; any other post uses its own file name up to the
    first dot.

    Args:
        year_path: ``pathlib.Path`` of the year directory to scan.
        output: Directory that receives one sub-directory per post.
    """
    # Raw string: ``\d`` in a normal string literal is an invalid escape.
    dated_dir = re.compile(r"\d{4}-\d{2}-\d{2}-.*")

    # ``rglob`` already recurses, so no leading ``**/`` is needed. Sorting
    # makes the processing order independent of the file system.
    for post in sorted(year_path.rglob('*.m*')):
        if not post.is_file():
            # A directory can match the pattern too and cannot be opened.
            continue
        if dated_dir.match(post.parent.name):
            target_name = post.parent.name
        else:
            target_name = post.name.split('.')[0]
        output_path = Path(output, target_name)
        output_path.mkdir(parents=True, exist_ok=True)
        convert_m2h(post, str(output_path))


def handle_list_conversion(input_dir, output_dir):
    """Convert each sub-directory of *input_dir* into a same-named output directory.

    Args:
        input_dir: Directory whose sub-directories are handed to
            ``convert_year`` one by one; plain files are ignored.
        output_dir: Root of the output tree. It is created on demand and may
            be given with or without a trailing slash.
    """
    output_root = Path(output_dir)
    for year in Path(input_dir).iterdir():
        if not year.is_dir():
            continue
        # Join with pathlib: plain string concatenation silently produced
        # "<output><year>" when the trailing slash was missing.
        year_output = output_root / year.name
        year_output.mkdir(parents=True, exist_ok=True)
        convert_year(year, str(year_output))


@click.command()
@click.option("--input", required=True, help="Input Path.")
@click.option("--output", required=True, help="Output Path")
def click_cli(input, output):
    """Convert the posts below --input and write the result below --output."""
    # Both options are required: without them the conversion would fail later
    # with a TypeError instead of a usage message.
    # The parameter names are dictated by the option names, hence ``input``.
    global g_input
    global g_output
    # Published as module globals; the header image migration reads g_input.
    g_input = input
    g_output = output
    handle_list_conversion(input, output)


# Start the command line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    click_cli()
init 2023-11-30 20:37:36 +01:00			`#!/bin/env python3`
			`import re`
			`import yaml`
			`import shutil`
			`import click`
			`from datetime import datetime`
			`from pathlib import Path`

			`def migrate_header_img_and_return_new_path_for_post(img_path, img_new_path):`
			`is_http = re.compile(r'https?://')`
			`if is_http.match(img_path):`
			`return`
			`img_pathlib = Path(img_path)`
			`output_path = Path(img_new_path)`
			`if re.compile(r'/').match(img_path):`
tags and path thinks 2023-11-30 21:09:15 +01:00			`old = str(Path(g_input).parent) + img_path`
init 2023-11-30 20:37:36 +01:00			`else:`
tags and path thinks 2023-11-30 21:09:15 +01:00			`old = g_input + str(output_path.parent.name) + '/' + str(output_path.name) + '/' + img_path`
init 2023-11-30 20:37:36 +01:00			`new = img_new_path + '/' + img_pathlib.name`
			`shutil.copyfile(old, new)`
			`return new.replace(new.split('/')[0], '/blog')`


			`def convert_m2h(input_file: str, output_dir: str):`
			`datetime_parse = re.compile(r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}")`

			`# Read the YAML frontmatter`
			`with open(input_file, 'r') as file:`
			`contents = file.read()`
			`front_matter = re.findall(r'---(.*?)---', contents, re.DOTALL)[0]`

			`data = yaml.safe_load(front_matter)`

			`# Convert the fields`
			`if isinstance(data['date'], datetime):`
			`data['publishDate'] = data['date'].isoformat()`
			`if isinstance(data['date'], str):`
			`data['publishDate'] = date_checker(data['date'])`
			`del data['date']`
			`data['lastmod'] = data['publishDate']`

			`if 'tags' in data:`
			`if data['tags'] is None or data['tags'] == []:`
			`del data['tags']`
			`else:`
			`if not isinstance(data['tags'], list):`
tags and path thinks 2023-11-30 21:09:15 +01:00			`tag_list = list()`
init 2023-11-30 20:37:36 +01:00			`data['tags'] = data['tags'].split()`
tags and path thinks 2023-11-30 21:09:15 +01:00			`for t in data.get('tags'):`
			`tag_list.append(t.strip(','))`
			`data['tags'] = tag_list`
init 2023-11-30 20:37:36 +01:00
			`if 'ws' in data:`
			`data['categories'] = 'event'`
			`if 'where' in data['ws']:`
			`data['location'] = data['ws'].pop('where')`
			`if 'when' in data['ws']:`
			`new_date = data['ws']['when']`
			`if datetime_parse.match(str(new_date)):`
			`data['date'] = date_checker(new_date)`

			`del data['ws']`
			`else:`
			`data['categories'] = 'article'`

			`if 'header_image' in data:`
			`new_img_path = migrate_header_img_and_return_new_path_for_post(data['header_image'], output_dir)`
			`header = {'image': new_img_path, 'caption': 'Sorry this blog entry was migrated, there is no Alt-Text'}`
			`del data['header_image']`
			`data['header'] = header`

			`if 'author' in data:`
			`data['authors'] = data['author']['display_name'].split(', ')`
			`# delete fields`
			`del data['author']`

			`if 'meta' in data:`
			`del data['meta']`
			`if 'published' in data:`
			`del data['published']`
			`if 'status' in data:`
			`del data['status']`
			`if 'layout' in data:`
			`del data['layout']`
			`if 'type' in data:`
			`del data['type']`

			`data['draft'] = False`

			`old_not_front_matter = re.findall(r'---.*?---(.*)', contents, re.DOTALL)[0]`

			`pattern = re.compile("{.*}", re.DOTALL)`
			`new_not_front_matter = pattern.sub("", old_not_front_matter)`

			`# Create the new YAML frontmatter`
			`new_front_matter = '\n' + yaml.safe_dump(data)`

			`new_contents = contents.replace(front_matter, new_front_matter)`
			`new_contents = new_contents.replace(old_not_front_matter, new_not_front_matter)`

			`# Write the new contents to the file`
			`with open(output_dir + "/index.md", 'w') as file:`
			`file.write(new_contents)`


			`def date_checker(new_date: str):`
			`dt_with_sek_tz = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .*")`
			`dt_with_sek = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")`
			`dt_min_tz = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2} .*")`
			`dt_min = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}")`
			`if dt_with_sek_tz.match(str(new_date)):`
			`return datetime.strptime(str(new_date), '%Y-%m-%d %H:%M:%S %Z').astimezone().isoformat()`
			`if dt_with_sek.match(str(new_date)):`
			`return datetime.strptime(str(new_date), '%Y-%m-%d %H:%M:%S').astimezone().isoformat()`
			`if dt_min_tz.match(str(new_date)):`
			`return datetime.strptime(str(new_date), '%Y-%m-%d %H:%M %Z').astimezone().isoformat()`
			`if dt_min.match(str(new_date)):`
			`return datetime.strptime(str(new_date), '%Y-%m-%d %H:%M').astimezone().isoformat()`


			`def convert_year(year_path, output):`
			`pattern = re.compile("\d{4}-{1}\d{2}-{1}\d{2}-.*")`
			`postsOfYear = list(year_path.rglob('**/*.m*'))`
			`for post in postsOfYear:`
			`if pattern.match(post.parent.name):`
			`output_path = Path(output, post.parent.name)`
			`output_path.mkdir(exist_ok=True)`
			`convert_m2h(post, str(output_path))`
			`else:`
			`output_path = Path(output, post.name.split('.')[0])`
			`output_path.mkdir(exist_ok=True)`
			`convert_m2h(post, str(output_path))`


			`def handle_list_conversion(input_dir, output_dir):`
			`input_path = Path(input_dir)`
			`for year in input_path.iterdir():`
			`if year.is_dir():`
			`output_path = Path(f"{output_dir}{year.name}")`
			`if not output_path.exists():`
			`output_path.mkdir(exist_ok=True)`
			`convert_year(year, str(output_path))`


			`@click.command()`
			`@click.option("--input", help="Input Path.")`
			`@click.option("--output", help="Output Path")`
			`def click_cli(input, output):`
tags and path thinks 2023-11-30 21:09:15 +01:00			`global g_input`
			`global g_output`
			`g_input = input`
			`g_output = output`
init 2023-11-30 20:37:36 +01:00			`handle_list_conversion(input, output)`


			`if __name__ == "__main__":`
			`click_cli()`