Fine-Tuning of my personal blog

tiktoken, which I mentioned above, is a Python token-counting library that you can install simply with pip:

pip install tiktoken
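
A quick sanity check that the install worked, and a peek at the basic API (get_encoding() returns an encoder whose encode() turns a string into a list of token IDs):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
print(enc.encode("Hello, world!"))       # a short list of token IDs
print(len(enc.encode("Hello, world!")))  # 4 tokens under cl100k_base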

Because I’m a fun guy, I just sat down and coded a per-line token counter for JSONL training files.

import json
import tiktoken

class Tokenizer:
    def __init__(self, encoding_name):
        # get_encoding() takes an encoding name like "cl100k_base", not a model name
        self.tokenizer = tiktoken.get_encoding(encoding_name)

    def count(self, text):
        encoded_text = self.tokenizer.encode(text)
        return len(encoded_text)

    def message(self, messages):
        # Annotate each message dict in place with its token count
        for msg in messages:
            role_string = msg['role']
            if 'name' in msg:
                role_string += ':' + msg['name']
            role_tokens = self.count(role_string)
            content_tokens = self.count(msg.get('content') or '')
            # 3 tokens of per-message overhead in the cl100k chat format
            msg['tokens'] = 3 + role_tokens + content_tokens
        return messages


cl100k = Tokenizer("cl100k_base")
# Example string
trainfile = """
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
""".strip()

# Uncomment the following lines to read from a file instead
# with open('filename.jsonl', 'r') as f:
#     trainfile = f.read()
max_line = 52  # report any example over this many tokens (e.g. 4080 for real training files)
over_max_line = []
total_tokens_list = []
tokentable = []  # optional: collect the annotated message lists
for i, line in enumerate(trainfile.split('\n'), start=1):
    try:
        jsonl_line = json.loads(line)
        token_msg_list = cl100k.message(jsonl_line['messages'])
        total_tokens = 3  # overhead: every reply is primed with <|start|>assistant<|message|>
        # Sum the per-message token counts for this example
        for msg in token_msg_list:
            total_tokens += msg.get('tokens', 0)
        # Check the example total once, after all messages are summed
        if total_tokens > max_line:
            print(f"line {i}: total tokens are over {max_line}")
            over_max_line.append(i)
        total_tokens_list.append(total_tokens)
        #tokentable.append(token_msg_list)
    except json.JSONDecodeError:
        print(f"Error decoding JSON on line {i}: {line[:320]}")

print(total_tokens_list)

It uses a stripped-down version of a class I’d already written that adds token metadata to lists of messages. (If your examples use functions, you’ll need a preprocessor that folds the function text the AI actually receives into the system message before counting.)
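
Here’s a minimal sketch of what I mean, assuming you approximate that text with a plain JSON dump of the function spec. The exact serialization the model sees isn’t published, so treat the resulting counts as estimates; inline_functions is a hypothetical helper name.

def inline_functions(example, functions):
    # Hypothetical preprocessor: fold the function spec into the system
    # message so the counter sees roughly what the model receives.
    # json.dumps is an assumption; OpenAI's actual wire format is undocumented.
    spec_text = json.dumps(functions)
    for msg in example['messages']:
        if msg['role'] == 'system':
            msg['content'] += '\n' + spec_text
            break
    return example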

The other values you might want to edit should be apparent once you understand what the code does.

Running it on the “Marv” example with a deliberately low threshold:

line 3: total tokens are over 52
[46, 51, 54]

It should also barf errors or crash appropriately on bad files.
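
For instance, a deliberately truncated line (hypothetical) hits the json.JSONDecodeError handler and gets reported rather than silently skipped:

bad = '{"messages": [{"role": "user", "content": "oops'
try:
    json.loads(bad)  # raises json.JSONDecodeError on the unterminated JSON
except json.JSONDecodeError:
    print(f"Error decoding JSON on line 1: {bad[:320]}")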
