Is there a syntax that allows us to do multi-turn preference fine-tuning?
Basically, a way to combine a multi-turn conversation like this:
[
{"role": "user", "content": "What's the weather like today?"},
{"role": "assistant", "content": "It's sunny and 25 degrees Celsius."},
{"role": "user", "content": "Okay, thanks."},
{"role": "assistant", "content": "You're welcome!"},
{"role": "user", "content": "What's the capital of France?"},
{"role": "assistant", "content": "Paris."},
{"role": "user", "content": "Great, thanks."},
{"role": "assistant", "content": "You're welcome!"}
]
and a single-turn preference example (preferred vs. non-preferred output) like this:
{
"input": {
"messages": [
{
"role": "user",
"content": "Hello, can you tell me how cold San Francisco is today?"
}
],
"tools": [],
"parallel_tool_calls": true
},
"preferred_output": [
{
"role": "assistant",
"content": "Today in San Francisco, it is not quite as cold as expected. Morning clouds will give way to sunshine, with a high near 68°F (20°C) and a low around 57°F (14°C)."
}
],
"non_preferred_output": [
{
"role": "assistant",
"content": "It is not particularly cold in San Francisco today."
}
]
}