Decoding Exported Data by Parsing conversations.json and/or chat.html

After exporting the chatgpt data, I would like to recreate the conversations similar to what is available in chat.html. It is not immediately clear to me how the mapping is structured in conversations.json. Is this documented somewhere?

I also tried to copy/paste chat.html into ChatGPT to see if I could get ChatGPT to figure out what the html code was doing, but I got a violation of terms acknowledgement window. Does anyone know why that might be?

4 Likes

Nevermind on part 2. It was related to having a really long line of json data apparently.

Finally got it working. Here is the conversation:

https://chat.openai.com/share/761c4e0c-b4a8-4c1b-b4bf-3ce32e9e3602

I was able to eventually get ChatGPT to get it. I didn't have to understand much of the code at all, but it did take a few hours to navigate its straying off course, etc.

The code at the bottom seems to work ok.

3 Likes

Hi, I'm working on the same thing; I had ChatGPT write a Python script to convert the JSON file into something I could work with, using TiddlyWiki as an importer. Any progress on this? Does anyone know if my conversations.json file is available via API?

If you log into the site, you can export your conversations (via settings). The files are produced and a link/URL is emailed to you. Click on the link and it will download. But be aware that the conversations.json file format has recently changed. The example code here is out of date.

I'm working on a tool to do the sort of thing described here (in my case, it's to create local markdown files from an export). It's working great so far but there's still a little more to do. If you'd like to try it out, shoot me a message at marty at martiansoftware dot com and I'll send you a copy as soon as it's ready.

Mate, thanks a lot! Works like a charm. I did a little polishing so one can use it easily as CLI. Also changed output name format so chat files are sorted by date when already sorted by name ( Github Gist fd351f01a5d561d433ae852fba8eca0a ):

"""
This script processes conversation data from a JSON file, extracts messages,
and writes them to text files. It also creates a summary JSON file with a summary
of the conversations. The script is designed to be run as a command-line interface (CLI),
allowing the user to specify the input JSON file and output directory.

Usage:
    python script_name.py /path/to/conversations.json /path/to/output_directory
"""

import unicodedata
import json
import re
import argparse
from datetime import datetime
from pathlib import Path


def extract_message_parts(message):
    """
    Pull the textual parts out of a message's content.

    Args:
        message (dict): A message object from the export mapping.

    Returns:
        list: The content's "parts" list, or an empty list when the
        message carries no plain-text content.
    """
    content = message.get("content")
    if not content or content.get("content_type") != "text":
        return []
    return content.get("parts", [])


def get_author_name(message):
    """
    Map a message's author role to a display label.

    Args:
        message (dict): A message object.

    Returns:
        str: "ChatGPT" for assistant, "Custom user info" for system,
        otherwise the raw role string (possibly "").
    """
    role = message.get("author", {}).get("role", "")
    display_labels = {"assistant": "ChatGPT", "system": "Custom user info"}
    return display_labels.get(role, role)


def get_conversation_messages(conversation):
    """
    Walk the conversation's node mapping from its current (leaf) node back
    to the root and collect displayable messages in chronological order.

    Args:
        conversation (dict): A conversation object.

    Returns:
        list: List of {"author", "text"} dicts, oldest first.
    """
    messages = []
    current_node = conversation.get("current_node")
    mapping = conversation.get("mapping", {})
    while current_node:
        node = mapping.get(current_node, {})
        message = node.get("message") if node else None
        if message:
            parts = extract_message_parts(message)
            # Filter on the RAW role. The original compared
            # get_author_name()'s result against "system", but that helper
            # relabels "system" to "Custom user info", so the
            # is_user_system_message filter was dead code and plain system
            # messages leaked into the transcript.
            role = message.get("author", {}).get("role", "")
            if parts and len(parts[0]) > 0:
                if role != "system" or message.get("metadata", {}).get(
                    "is_user_system_message"
                ):
                    messages.append(
                        {"author": get_author_name(message), "text": parts[0]}
                    )
        current_node = node.get("parent") if node else None
    return messages[::-1]


def create_directory(base_dir, date):
    """
    Ensure a "YYYY_MM" subdirectory of *base_dir* exists for *date*.

    Args:
        base_dir (Path): Base output directory.
        date (datetime): Date whose year/month names the directory.

    Returns:
        Path: The (possibly freshly created) directory path.
    """
    target = base_dir / date.strftime("%Y_%m")
    target.mkdir(parents=True, exist_ok=True)
    return target


def sanitize_title(title):
    """
    Turn a conversation title into a safe file-name fragment while
    preserving non-ASCII characters.

    Args:
        title (str): The title of the conversation.

    Returns:
        str: NFKC-normalized title with filesystem-hostile characters,
        control characters and whitespace replaced by underscores,
        capped at 140 characters.
    """
    normalized = unicodedata.normalize("NFKC", title)
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1F\s]', '_', normalized)
    return cleaned[:140]


def create_file_name(directory_path, title, date):
    """
    Build the output file path for one conversation.

    Args:
        directory_path (Path): The directory the file will live in.
        title (str): The conversation title (sanitized before use).
        date (datetime): The date used as the file-name prefix.

    Returns:
        Path: "<directory>/<YYYY_MM_DD>_<sanitized title>.txt".
    """
    stem = f"{date.strftime('%Y_%m_%d')}_{sanitize_title(title)}"
    return directory_path / f"{stem}.txt"


def write_messages_to_file(file_path, messages):
    """
    Write a transcript as alternating author/text lines.

    Args:
        file_path (Path): The path of the file to write to.
        messages (list): Message dicts with "author" and "text" keys.
    """
    lines = []
    for entry in messages:
        lines.append(f"{entry['author']}\n")
        lines.append(f"{entry['text']}\n")
    with file_path.open("w", encoding="utf-8") as handle:
        handle.writelines(lines)


def update_conversation_summary(summary, directory_name, conversation, date, messages):
    """
    Append one conversation's entry under its month-directory bucket.

    Args:
        summary (dict): Mapping of directory name -> list of entries (mutated).
        directory_name (str): The name of the month directory.
        conversation (dict): The conversation object.
        date (datetime): The updated date of the conversation.
        messages (list): List of messages in the conversation.
    """
    create_time = conversation.get("create_time")
    # Some exports lack "create_time"; the original code crashed with
    # TypeError on datetime.fromtimestamp(None). Fall back to "".
    create_time_str = (
        datetime.fromtimestamp(create_time).strftime("%Y-%m-%d %H:%M:%S")
        if create_time
        else ""
    )
    summary.setdefault(directory_name, []).append(
        {
            "title": conversation.get("title", "Untitled"),
            "create_time": create_time_str,
            "update_time": date.strftime("%Y-%m-%d %H:%M:%S"),
            "messages": messages,
        }
    )


def write_summary_json(output_dir, summary):
    """
    Dump the conversation summary as pretty-printed UTF-8 JSON.

    Args:
        output_dir (Path): Directory to place conversation_summary.json in.
        summary (dict): The conversation summary to serialize.
    """
    target = output_dir / "conversation_summary.json"
    with target.open("w", encoding="utf-8") as handle:
        json.dump(summary, handle, ensure_ascii=False, indent=4)


def write_conversations_and_summary(conversations_data, output_dir):
    """
    Export every conversation to a text file and build a summary JSON.

    Args:
        conversations_data (list): List of conversation objects.
        output_dir (Path): Directory to save the output files.

    Returns:
        list: Dicts describing each created directory/file pair.
    """
    created = []
    summary = {}

    for conversation in conversations_data:
        update_ts = conversation.get("update_time")
        if not update_ts:
            # Without an update time the conversation cannot be bucketed.
            continue

        when = datetime.fromtimestamp(update_ts)
        month_dir = create_directory(output_dir, when)
        file_path = create_file_name(
            month_dir, conversation.get("title", "Untitled"), when
        )

        messages = get_conversation_messages(conversation)
        write_messages_to_file(file_path, messages)
        update_conversation_summary(
            summary, month_dir.name, conversation, when, messages
        )
        created.append({"directory": str(month_dir), "file": str(file_path)})

    write_summary_json(output_dir, summary)
    return created


def main():
    """
    CLI entry point: parse arguments, load the JSON, run the export,
    and report what was written.
    """
    parser = argparse.ArgumentParser(
        description="Process conversation data from a JSON file."
    )
    parser.add_argument(
        "input_file", type=Path, help="Path to the input conversations JSON file."
    )
    parser.add_argument(
        "output_dir", type=Path, help="Directory to save the output files."
    )
    args = parser.parse_args()

    if not args.input_file.exists():
        print(f"Error: The input file '{args.input_file}' does not exist.")
        return

    with args.input_file.open("r", encoding="utf-8") as file:
        conversations_data = json.load(file)

    for info in write_conversations_and_summary(conversations_data, args.output_dir):
        print(f"Created {info['file']} in directory {info['directory']}")


if __name__ == "__main__":
    main()

3 Likes

I just released the tool I mentioned above to convert ChatGPT exports into local Markdown files. It's called ChatKeeper, and I'd love any feedback or suggestions for improvement you might have.

Links are not allowed here but it can be found on my website at martiansoftware dot com.

1 Like

Why wouldn't you build a Chrome-extension type of tool?

Because I wanted to act on the full conversation history, not just the current conversation, and I wanted full access to the local filesystem (which might be possible in a Chrome extension, but I'm not sure).

Also, I don't use Chrome. :slight_smile: And I didn't want to make something specific to one browser.

For those who are interested in using PowerShell here is a script that does the same thing:

# Prompt the user for the directory containing the conversations.json file
# (consumed further down when the export file is loaded and parsed).
$inputDirectory = Read-Host "Please enter the directory where conversations.json is located"

# Define the function to get conversation messages
# Walk a conversation's node mapping from the current (leaf) node back to the
# root, collecting displayable text messages, and return them oldest-first.
function Get-ConversationMessages {
    param ($conversation)

    $messages = @()
    $currentNode = $conversation.current_node
    $mapping = $conversation.mapping

    while ($currentNode) {
        $node = $mapping.$currentNode
        $message = $node.message
        $content = $message.content
        $author = if ($message.author) { $message.author.role } else { "" }

        if ($content -and $content.content_type -eq "text") {
            $parts = $content.parts
            if ($parts.Count -gt 0 -and $parts[0].Length -gt 0) {
                # Skip system messages unless they carry the user's custom instructions.
                if ($author -ne "system" -or ($message.metadata).is_user_system_message) {
                    if ($author -eq "assistant") { $author = "ChatGPT" }
                    elseif ($author -eq "system") { $author = "Custom user info" }

                    $messages += [pscustomobject]@{
                        Author = $author
                        Text   = $parts[0]
                    }
                }
            }
        }

        $currentNode = $node.parent
    }

    # Messages were collected newest-first while walking parent links; reverse
    # them into chronological order. The original `Sort-Object -Descending`
    # SORTED the objects rather than reversing the collected order.
    [array]::Reverse($messages)
    return $messages
}

# Define the function to write conversations and create pruned.json
# Write one text file per conversation (bucketed into Month_Year folders next
# to the script) and a pruned.json summary of all conversations.
function Write-ConversationsAndJson {
    param ($conversationsData)

    # Get the directory where the script is saved
    $outputDirectory = $PSScriptRoot

    $createdDirectoriesInfo = @()
    $prunedData = @{}

    foreach ($conversation in $conversationsData) {
        $updated = $conversation.update_time
        if (-not $updated) { continue }

        # Convert Unix timestamp to DateTime
        $updatedDate = Get-Date ([DateTimeOffset]::FromUnixTimeSeconds($updated).DateTime)

        $directoryName = $updatedDate.ToString("MMMM_yyyy")
        $directoryPath = Join-Path $outputDirectory $directoryName

        if (-not (Test-Path $directoryPath)) {
            New-Item -Path $directoryPath -ItemType Directory | Out-Null
        }

        $title = if ($conversation.title) { $conversation.title } else { "Untitled" }
        $sanitizedTitle = $title -replace "[^a-zA-Z0-9_]", "_"
        # Cap the title portion of the file name at 120 characters. The
        # original `-replace "^.{120}", "$&"` replaced the first 120 chars
        # with themselves, i.e. it never truncated anything.
        if ($sanitizedTitle.Length -gt 120) {
            $sanitizedTitle = $sanitizedTitle.Substring(0, 120)
        }
        # Braces are required here: "$sanitizedTitle_" is parsed as a
        # (nonexistent) variable named "sanitizedTitle_", which silently
        # dropped the title from every file name.
        $fileName = "$directoryPath/${sanitizedTitle}_$($updatedDate.ToString('dd_MM_yyyy_HH_mm_ss')).txt"

        $messages = Get-ConversationMessages $conversation
        $messageContent = $messages | ForEach-Object { "$($_.Author)`n$($_.Text)`n" }

        Set-Content -Path $fileName -Value $messageContent -Encoding UTF8

        if (-not $prunedData[$directoryName]) {
            $prunedData[$directoryName] = @()
        }

        $prunedData[$directoryName] += @{
            Title       = $title
            Create_Time = (Get-Date ([DateTimeOffset]::FromUnixTimeSeconds($conversation.create_time).DateTime)).ToString("yyyy-MM-dd HH:mm:ss")
            Update_Time = $updatedDate.ToString("yyyy-MM-dd HH:mm:ss")
            Messages    = $messages
        }

        $createdDirectoriesInfo += @{
            Directory = $directoryPath
            File      = $fileName
        }
    }

    $jsonPrunedData = $prunedData | ConvertTo-Json -Depth 4 -Compress
    Set-Content -Path (Join-Path $outputDirectory "pruned.json") -Value $jsonPrunedData -Encoding UTF8

    return $createdDirectoriesInfo
}

# Load conversations.json data after prompting the user for the directory
$conversationFilePath = Join-Path $inputDirectory "conversations.json"

if (Test-Path $conversationFilePath) {
    # Parse the whole export, then write per-conversation text files and pruned.json.
    $conversationData = Get-Content $conversationFilePath -Raw | ConvertFrom-Json
    $createdDirectoriesInfo = Write-ConversationsAndJson -conversationsData $conversationData
    Write-Host "Processing complete. Files saved in: $PSScriptRoot"
} else {
    Write-Host "Error: conversations.json not found in the provided directory."
}

That's a cool PowerShell script and although I haven't tried it, it looks useful for some basic use. Reading through it, however, I can confidently say it doesn't do the same thing as my software. :slight_smile: But it might be a great alternative for some folks using Windows. The more options for backing up and having useful local copies of our conversations, the better.

I actually came here to say that there's a beta version of ChatKeeper (at martiansoftware dot com) available now for Mac users, in case anyone was disappointed not to see Mac support earlier. I hope it's helpful to folks.

I've been working on some new features and improvements that I'd love for folks to try out in a new release candidate for ChatKeeper version 1.1.0.

Rendering to Markdown any conversations that use the new Canvas and Search features requires a lot of JSON processing that even OpenAI's own export viewer (the HTML and JavaScript included in the export) doesn't do.

I'm not aware of any current issues with this version, but I'm still finalizing testing for the official 1.1.0 release… so it might have issues I haven't found yet. So while I'm testing I'm also very interested to hear how it works for you.

Here's What's New in ChatKeeper 1.1.0-rc.1

  • Support for ChatGPT's new Canvas feature: You can now save and manage your Canvas sessions with ChatKeeper. Please note that there are some known issues with ChatGPT's export format for Canvas chats. I've reported these to OpenAI and implemented workarounds where possible.
  • Support for ChatGPT's new Search feature: ChatKeeper now formats ChatGPT's search summaries and sources in Markdown.
  • Native Apple Silicon Support: ChatKeeper now runs natively on Apple Silicon (m1/m2/m3/m4). No more need for Rosetta complications.
  • Official Homebrew Installer: Mac users now have an easy way to install and update on their systems. The correct binary will be automatically installed for your platform.
  • Conversation Index by Start Date: A new index document organizes your conversations by their start dates for another navigation option (in addition to the previously existing index by last activity).
  • Numbered Conversation Turns: Conversation "turns" now have numbered Markdown headings in order to enable linking to specific messages within conversations.
  • Version Information Display: The ChatKeeper version is now included in both YAML front matter and in more user messages in both Markdown and the CLI.

- Marty

#!/home/jack/miniconda3/envs/cloned_base/bin/python
import json
import logging
import os
import glob
import subprocess
import os
import string
'''
Create a working directory containing a folder called CHATGPT, and unzip the
downloaded ChatGPT data-export zip file into that CHATGPT folder. Run this
script from the project folder. It will create three folders:
    CHATGPT/JSON
    CHATGPT/HTML
    CHATGPT/TEXT
and convert the data into three forms: txt, html and json.
It will also create a database called CHATGPT_files.db containing a table
called files, insert the data into that table, and close the connection.
'''
def clean_title(title):
    """Drop non-alphanumeric characters from *title* and replace spaces with
    underscores so the result is usable as a file name."""
    valid_chars = set(string.ascii_letters + string.digits + string.whitespace)
    cleaned_title = ''.join(char if char in valid_chars else '' for char in title)
    cleaned_title = cleaned_title.replace(' ', '_')  # Replace spaces with underscores
    return cleaned_title.strip()

# Helper to create a folder if it does not already exist.
def make_path_exist(directory):
    """Create *directory* (resolved relative to the current working
    directory) if it does not already exist."""
    path = os.path.join(os.getcwd(), directory)
    if not os.path.exists(path):
        os.makedirs(path)

def split_and_save_and_convert(conversations_file):
    """
    Split conversations.json into one JSON file per conversation, then
    convert each conversation to HTML and plain text.
    """
    directory1 = 'CHATGPT/JSON'
    make_path_exist(directory1)
    directory2 = 'CHATGPT/HTML'
    make_path_exist(directory2)
    directory3 = 'CHATGPT/TEXT'
    make_path_exist(directory3)
    try:
        with open(conversations_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        for conversation in data:
            title = conversation.get('title', 'Unknown_Title')
            title_with_underscores = clean_title(title)
            chapter_filename = f"{title_with_underscores}.json"
            chapter_filepath = os.path.join(directory1, chapter_filename)

            logging.info(f"Saving data for conversation '{title}' to {chapter_filepath}")

            with open(chapter_filepath, 'w', encoding='utf-8') as chapter_file:
                json.dump([conversation], chapter_file, indent=2)

            # Convert JSON to HTML
            html_output_file = os.path.join(directory2, f"{title_with_underscores}.html")
            convert_to_html(chapter_filepath, html_output_file)

            # Convert JSON to TXT
            txt_output_file = os.path.join(directory3, f"{title_with_underscores}.txt")
            convert_to_txt(chapter_filepath, txt_output_file)

    except FileNotFoundError:
        logging.error(f"File not found: {conversations_file}")
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON in file: {conversations_file}")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")

def convert_to_html(json_file, html_output_file):
    """Render a per-conversation JSON file as a simple escaped HTML page."""
    with open(json_file, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    result_str = get_conversation_result(json_data)

    with open(html_output_file, "w", encoding='utf-8') as html_output:
        # NOTE(review): the original splits on "/n" (literal slash-n), not
        # "\n" — preserved as-is; confirm this marker is intentional.
        result_html = result_str.replace("/n", "XXXXXXX\n")
        result_html = result_html.replace("<", "&lt;")
        result_html = result_html.replace(">", "&gt;")
        for line in result_html.split("XXXXXXX"):
            line = line.replace("\n", "<br />\n")
            html_output.write(line)

def convert_to_txt(json_file, txt_output_file):
    """Render a per-conversation JSON file as a plain-text transcript."""
    with open(json_file, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    result_str = get_conversation_result(json_data)

    with open(txt_output_file, "w", encoding='utf-8') as txt_output:
        # NOTE(review): splits on literal "/n", not "\n" — preserved as-is.
        result_txt = result_str.replace("/n", "XXXXXXX\n")
        for line in result_txt.split("XXXXXXX"):
            txt_output.write(line)

def get_conversation_result(json_data):
    """Concatenate each conversation's title and author/text message lines
    into one newline-separated string."""
    result_str = ""
    for conversation in json_data:
        title = conversation.get('title', '')
        messages = get_conversation_messages(conversation)

        result_str += title + '\n'
        for message in messages:
            result_str += message['author'] + '\n' + message['text'] + '\n'
        result_str += '\n'

    return result_str

def get_conversation_messages(conversation):
    """Walk the node mapping from the current node to the root, collecting
    text messages, and return them in chronological order."""
    messages = []
    current_node = conversation.get('current_node')
    while current_node:
        node = conversation['mapping'][current_node]
        message = node.get('message')
        # Keep only non-empty text messages; drop system messages unless
        # they are the user's custom instructions.
        if (message and message.get('content') and message['content'].get('content_type') == 'text' and
                len(message['content'].get('parts', [])) > 0 and len(message['content']['parts'][0]) > 0 and
                (message['author']['role'] != 'system' or message.get('metadata', {}).get('is_user_system_message'))):
            author = message['author']['role']
            if author == 'assistant':
                author = 'ChatGPT'
            elif author == 'system' and message['metadata'].get('is_user_system_message'):
                author = 'Custom user info'
            messages.append({'author': author, 'text': message['content']['parts'][0]})
        current_node = node.get('parent')
    return messages[::-1]

# Example usage
conversations_file_path = 'CHATGPT/conversations.json'
# output_folder = 'CHATGPT/output_txt_html_json'

# Ensure the output folder exists:
# os.makedirs(output_folder, exist_ok=True)

# Configure logging
logging.basicConfig(level=logging.INFO)

# Call the split, save, and convert function
split_and_save_and_convert(conversations_file_path)
import sqlite3
import os
import hashlib

# Connect to the SQLite database (creates a new database if it doesn't exist)
db_path2 = 'CHATGPT_files.db'
conn = sqlite3.connect(db_path2)
cursor = conn.cursor()

# Create a table to store file information
cursor.execute('''
CREATE TABLE IF NOT EXISTS files (
    id INTEGER PRIMARY KEY,
    filename TEXT NOT NULL,
    content BLOB NOT NULL,
    text_content TEXT NOT NULL,
    hash_value TEXT NOT NULL,
    format TEXT NOT NULL
)
''')

# Commit changes and close the connection
conn.commit()
conn.close()

# Function to calculate the SHA-256 hash of a file
def calculate_hash(file_path):
    """Return the hex SHA-256 digest of *file_path*, read in 8 KB chunks."""
    sha256 = hashlib.sha256()
    with open(file_path, 'rb') as file:
        while chunk := file.read(8192):  # Read in 8KB chunks
            sha256.update(chunk)
    return sha256.hexdigest()

# Function to insert a file record into the database
def insert_file(filename, content, text_content, hash_value, file_format):
    """Insert one file's raw bytes, decoded text, hash and format into the
    files table of the CHATGPT_files.db database."""
    conn = sqlite3.connect(db_path2)
    cursor = conn.cursor()
    cursor.execute('INSERT INTO files (filename, content, text_content, hash_value, format) VALUES (?, ?, ?, ?, ?)',
                   (filename, content, text_content, hash_value, file_format))
    conn.commit()
    conn.close()

# Function to insert all .txt files from a directory into the database
def insert_text_files(directory):
    """Read every .txt file directly inside *directory* and store it in the
    database via insert_file()."""
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)  # Construct full file path
            with open(file_path, 'rb') as file:
                print(file_path)
                file_content = file.read()

            text_content = file_content.decode('utf-8', errors='ignore')  # Convert bytes to string
            hash_value = calculate_hash(file_path)
            insert_file(filename, file_content, text_content, hash_value, 'txt')
            print(f"Inserted: {filename}")

# Example: insert the generated text files from the specified directory
input_folder = 'CHATGPT/TEXT'
insert_text_files(input_folder)

print('Insertion process completed.')
#---------------------------------------------------
def clean_title(title):
    """Drop non-alphanumeric characters from *title* and replace spaces with
    underscores so the result is usable as a file name."""
    valid_chars = set(string.ascii_letters + string.digits + string.whitespace)
    cleaned_title = ''.join(char if char in valid_chars else '' for char in title)
    cleaned_title = cleaned_title.replace(' ', '_')  # Replace spaces with underscores
    return cleaned_title.strip()

# Helper to create a folder if it does not already exist.
def make_path_exist(directory):
    """Create *directory* (resolved relative to the current working
    directory) if it does not already exist."""
    path = os.path.join(os.getcwd(), directory)
    if not os.path.exists(path):
        os.makedirs(path)
def split_and_save_and_convert(conversations_file):
    """
    Split conversations.json into one JSON file per conversation, then
    convert each conversation to HTML and plain text.
    """
    directory1 = 'CHATGPT/JSON'
    make_path_exist(directory1)
    directory2 = 'CHATGPT/HTML'
    make_path_exist(directory2)
    directory3 = 'CHATGPT/TEXT'
    make_path_exist(directory3)
    try:
        with open(conversations_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        for conversation in data:
            title = conversation.get('title', 'Unknown_Title')
            title_with_underscores = clean_title(title)
            chapter_filename = f"{title_with_underscores}.json"
            chapter_filepath = os.path.join(directory1, chapter_filename)

            logging.info(f"Saving data for conversation '{title}' to {chapter_filepath}")

            with open(chapter_filepath, 'w', encoding='utf-8') as chapter_file:
                json.dump([conversation], chapter_file, indent=2)

            # Convert JSON to HTML
            html_output_file = os.path.join(directory2, f"{title_with_underscores}.html")
            convert_to_html(chapter_filepath, html_output_file)

            # Convert JSON to TXT
            txt_output_file = os.path.join(directory3, f"{title_with_underscores}.txt")
            convert_to_txt(chapter_filepath, txt_output_file)

    except FileNotFoundError:
        logging.error(f"File not found: {conversations_file}")
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON in file: {conversations_file}")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")

def convert_to_html(json_file, html_output_file):
    """Render a per-conversation JSON file as a simple escaped HTML page."""
    with open(json_file, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    result_str = get_conversation_result(json_data)

    with open(html_output_file, "w", encoding='utf-8') as html_output:
        # NOTE(review): splits on literal "/n", not "\n" — preserved as-is.
        result_html = result_str.replace("/n", "XXXXXXX\n")
        result_html = result_html.replace("<", "&lt;")
        result_html = result_html.replace(">", "&gt;")
        for line in result_html.split("XXXXXXX"):
            line = line.replace("\n", "<br />\n")
            html_output.write(line)

def convert_to_txt(json_file, txt_output_file):
    """Render a per-conversation JSON file as a plain-text transcript."""
    with open(json_file, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    result_str = get_conversation_result(json_data)

    with open(txt_output_file, "w", encoding='utf-8') as txt_output:
        # NOTE(review): splits on literal "/n", not "\n" — preserved as-is.
        result_txt = result_str.replace("/n", "XXXXXXX\n")
        for line in result_txt.split("XXXXXXX"):
            txt_output.write(line)

def get_conversation_result(json_data):
    """Concatenate each conversation's title and author/text message lines
    into one newline-separated string."""
    result_str = ""
    for conversation in json_data:
        title = conversation.get('title', '')
        messages = get_conversation_messages(conversation)

        result_str += title + '\n'
        for message in messages:
            result_str += message['author'] + '\n' + message['text'] + '\n'
        result_str += '\n'

    return result_str

def get_conversation_messages(conversation):
    """Walk the node mapping from the current node to the root, collecting
    text messages, and return them in chronological order."""
    messages = []
    current_node = conversation.get('current_node')
    while current_node:
        node = conversation['mapping'][current_node]
        message = node.get('message')
        # Keep only non-empty text messages; drop system messages unless
        # they are the user's custom instructions.
        if (message and message.get('content') and message['content'].get('content_type') == 'text' and
                len(message['content'].get('parts', [])) > 0 and len(message['content']['parts'][0]) > 0 and
                (message['author']['role'] != 'system' or message.get('metadata', {}).get('is_user_system_message'))):
            author = message['author']['role']
            if author == 'assistant':
                author = 'ChatGPT'
            elif author == 'system' and message['metadata'].get('is_user_system_message'):
                author = 'Custom user info'
            messages.append({'author': author, 'text': message['content']['parts'][0]})
        current_node = node.get('parent')
    return messages[::-1]

# Example usage
conversations_file_path = 'CHATGPT/conversations.json'
# output_folder = 'CHATGPT/output_txt_html_json'

# Ensure the output folder exists:
# os.makedirs(output_folder, exist_ok=True)

# Configure logging
logging.basicConfig(level=logging.INFO)

# Call the split, save, and convert function
# split_and_save_and_convert(conversations_file_path)
json_file = 'CHATGPT/conversations.json'
txt_output_file = "conversations_2_text.txt"
convert_to_txt(json_file, txt_output_file)

# Tag each user turn so the transcript can later be split into
# user/ChatGPT dialogue pairs. "a" mode accumulates across runs.
# (The original left the output handle open; `with` closes it.)
with open("conversations_2_text.txt", "r") as data:
    lines = data.read()
line = lines.replace("user\n", "CHAT_DIALOGUEuser\n")
with open("conversations.txt", "a") as insert_handle:
    insert_handle.write(line)
import sqlite3
import logging

# Configure logging
logging.basicConfig(level=logging.DEBUG)

def connect_to_database(database_name):
    """
    Connect to the SQLite database.

    Args:
        database_name (str): The name of the SQLite database file.
    Returns:
        sqlite3.Connection or None: The database connection or None if
        connection fails.
    """
    try:
        conn = sqlite3.connect(database_name)
        logging.info("Connected to the database successfully.")
        return conn
    except Exception as e:
        logging.error(f"Failed to connect to the database: {e}")
        return None

def create_table(conn):
    """
    Create the dialogue table in the database if it does not already exist.

    Args:
        conn (sqlite3.Connection): The SQLite database connection.
    """
    try:
        if conn:
            c = conn.cursor()
            c.execute('''CREATE TABLE IF NOT EXISTS dialogue (
                id INTEGER PRIMARY KEY,
                user_ChatGPT_PAIR TEXT,
                user_ChatGPT_PAIRb BLOB
            )''')
            conn.commit()
            logging.info("Table 'dialogue' created successfully.")
    except Exception as e:
        logging.error(f"Failed to create table 'dialogue': {e}")
def insert_dialogue(conn, dialogue_data):
    """
    Insert one dialogue chunk into the database (as both TEXT and BLOB).

    Args:
        conn (sqlite3.Connection): The SQLite database connection.
        dialogue_data (str): The dialogue text to insert.
    """
    try:
        if conn:
            c = conn.cursor()
            c.execute("INSERT INTO dialogue (user_ChatGPT_PAIR, user_ChatGPT_PAIRb) VALUES (?,?)",
                      (dialogue_data, dialogue_data.encode('utf-8')))
            conn.commit()
            logging.info("Dialogue inserted into the database successfully.")
    except Exception as e:
        logging.error(f"Failed to insert dialogue into the database: {e}")

# Define the file path
file_path = 'conversations.txt'

# Read the file, split it on the CHAT_DIALOGUE markers inserted earlier,
# and store each dialogue chunk in the database.
try:
    with open(file_path, "r") as file:
        file_contents = file.read()
        dialogue_parts = file_contents.split("CHAT_DIALOGUE")
        conn = connect_to_database('dialogueEXP2.db')
        if conn:
            create_table(conn)
            for dialogue_part in dialogue_parts:
                insert_dialogue(conn, dialogue_part.strip())
                print(".", end="-")  # lightweight progress indicator
            conn.close()
except Exception as e:
    logging.error(f"An error occurred while reading or processing the file: {e}")