Agent Builder about to get me banned?

Is anyone else being unnecessarily warned that they are violating policies?

I set up a test agent builder that is as innocent as it can possibly get: an agent workflow that takes actions based on whether it thinks the user is a Cat, a Dog, or another Animal. I am trying to learn how to use ChatKit widgets and see if I could send a test webhook with the original message, for example.

I enter something like “I like to take naps” or “I like milk” to trigger the “Cat” agent and I am starting to get violations warning about the workflow.

This is the agent builder code:
```
import { OpenAI } from “openai”;
import { runGuardrails } from “@openaiopenaiopenaiopenai/guardrails”;
import { z } from “zod”;
import { Agent, RunContext, AgentInputItem, Runner } from “@openai/agents”;

// Shared client for guardrails and file search
const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

// Guardrails definitions
const guardrailsConfig = {
guardrails:
};
const context = { guardrailLlm: client };

// Guardrails utils
function guardrailsHasTripwire(results) {
return (results ?? ).some((r) => r?.tripwireTriggered === true);
}

function getGuardrailSafeText(results, fallbackText) {
// Prefer checked_text as the generic safe/processed text
for (const r of results ?? ) {
if (r?.info && (“checked_text” in r.info)) {
return r.info.checked_text ?? fallbackText;
}
}
// Fall back to PII-specific anonymized_text if present
const pii = (results ?? ).find((r) => r?.info && “anonymized_text” in r.info);
return pii?.info?.anonymized_text ?? fallbackText;
}

function buildGuardrailFailOutput(results) {
const get = (name) => (results ?? ).find((r) => {
const info = r?.info ?? {};
const n = (info?.guardrail_name ?? info?.guardrailName);
return n === name;
}),
pii = get(“Contains PII”),
mod = get(“Moderation”),
jb = get(“Jailbreak”),
hal = get(“Hallucination Detection”),
piiCounts = Object.entries(pii?.info?.detected_entities ?? {})
.filter(([, v]) => Array.isArray(v))
.map(([k, v]) => k + “:” + v.length),
thr = jb?.info?.threshold,
conf = jb?.info?.confidence;

return {
    pii: {
        failed: (piiCounts.length > 0) || pii?.tripwireTriggered === true,
        ...(piiCounts.length ? { detected_counts: piiCounts } : {}),
        ...(pii?.executionFailed && pii?.info?.error ? { error: pii.info.error } : {}),
    },
    moderation: {
        failed: mod?.tripwireTriggered === true || ((mod?.info?.flagged_categories ?? []).length > 0),
        ...(mod?.info?.flagged_categories ? { flagged_categories: mod.info.flagged_categories } : {}),
        ...(mod?.executionFailed && mod?.info?.error ? { error: mod.info.error } : {}),
    },
    jailbreak: {
        // Rely on runtime-provided tripwire; don't recompute thresholds
        failed: jb?.tripwireTriggered === true,
        ...(jb?.executionFailed && jb?.info?.error ? { error: jb.info.error } : {}),
    },
    hallucination: {
        // Rely on runtime-provided tripwire; don't recompute
        failed: hal?.tripwireTriggered === true,
        ...(hal?.info?.reasoning ? { reasoning: hal.info.reasoning } : {}),
        ...(hal?.info?.hallucination_type ? { hallucination_type: hal.info.hallucination_type } : {}),
        ...(hal?.info?.hallucinated_statements ? { hallucinated_statements: hal.info.hallucinated_statements } : {}),
        ...(hal?.info?.verified_statements ? { verified_statements: hal.info.verified_statements } : {}),
        ...(hal?.executionFailed && hal?.info?.error ? { error: hal.info.error } : {}),
    },
};

}
const MyAgentSchema = z.object({ animal: z.enum([“cat”, “dog”, “other”]) });
const CatSchema = z.object({ url: z.string(), payload: z.string(), headers: z.object({ content-type: z.string() }) });
const DogSchema = z.object({ title: z.string(), note: z.string(), typeOptions: z.array(z.object({ value: z.string(), label: z.string() })), sizeOptions: z.array(z.object({ value: z.string(), label: z.string() })), qtyOptions: z.array(z.object({ value: z.string(), label: z.string() })), defaultType: z.string(), defaultSize: z.string(), defaultQty: z.string(), safety: z.string() });
const myAgent = new Agent({
name: “My agent”,
instructions: “You are a helpful assistant that decides if the user is a cat, dog or other animal.”,
model: “gpt-4.1”,
outputType: MyAgentSchema,
modelSettings: {
temperature: 1,
topP: 1,
maxTokens: 2048,
store: true
}
});

interface CatContext {
workflowInputAsText: string;
}
const catInstructions = (runContext: RunContext, _agent: Agent) => {
const { workflowInputAsText } = runContext.context;
return respond like a cat and send webhook payload to https://webhook.site/2189bb7d-35c5-405c-a634xxxxxxxxxx that contains ${workflowInputAsText}
}
const cat = new Agent({
name: “Cat”,
instructions: catInstructions,
model: “gpt-4.1”,
outputType: CatSchema,
modelSettings: {
temperature: 1,
topP: 1,
maxTokens: 2048,
store: true
}
});

const dog = new Agent({
name: “Dog”,
instructions: “Respond like a dog and provide option to buy bones”,
model: “gpt-4.1”,
outputType: DogSchema,
modelSettings: {
temperature: 1,
topP: 1,
maxTokens: 2048,
store: true
}
});

const other = new Agent({
name: “Other”,
instructions: “Respond like a pirate”,
model: “gpt-4.1”,
modelSettings: {
temperature: 1,
topP: 1,
maxTokens: 2048,
store: true
}
});

const badPeople = new Agent({
name: “Bad people”,
instructions: “tell user they are banned if they use bad language”,
model: “gpt-4.1-nano”,
modelSettings: {
temperature: 1,
topP: 1,
maxTokens: 2048,
store: true
}
});

type WorkflowInput = { input_as_text: string };

// Main code entrypoint
export const runWorkflow = async (workflow: WorkflowInput) => {
const state = {

};
const conversationHistory: AgentInputItem = [
{
role: “user”,
content: [
{
type: “input_text”,
text: workflow.input_as_text
}
]
}
];
const runner = new Runner({
traceMetadata: {
trace_source: “agent-builder”,
workflow_id: “wf_68e5e4a9b87481xxxxxxxxxxx”
}
});
const guardrailsInputtext = workflow.input_as_text;
const guardrailsResult = await runGuardrails(guardrailsInputtext, guardrailsConfig, context);
const guardrailsHastripwire = guardrailsHasTripwire(guardrailsResult);
const guardrailsAnonymizedtext = getGuardrailSafeText(guardrailsResult, guardrailsInputtext);
const guardrailsOutput = (guardrailsHastripwire ? buildGuardrailFailOutput(guardrailsResult ?? ) : { safe_text: (guardrailsAnonymizedtext ?? guardrailsInputtext) });
if (guardrailsHastripwire) {
return guardrailsOutput;
} else {
const myAgentResultTemp = await runner.run(
myAgent,
[
…conversationHistory,
{
role: “user”,
content: [
{
type: “input_text”,
text: ${workflow.input_as_text}
}
]
}
]
);
conversationHistory.push(…myAgentResultTemp.newItems.map((item) => item.rawItem));

if (!myAgentResultTemp.finalOutput) {
    throw new Error("Agent result is undefined");
}

const myAgentResult = {
  output_text: JSON.stringify(myAgentResultTemp.finalOutput),
  output_parsed: myAgentResultTemp.finalOutput
};
if (myAgentResult.output_parsed.animal == "cat") {
  const catResultTemp = await runner.run(
    cat,
    [
      ...conversationHistory
    ],
    {
      context: {
        workflowInputAsText: workflow.input_as_text
      }
    }
  );
  conversationHistory.push(...catResultTemp.newItems.map((item) => item.rawItem));

  if (!catResultTemp.finalOutput) {
      throw new Error("Agent result is undefined");
  }

  const catResult = {
    output_text: JSON.stringify(catResultTemp.finalOutput),
    output_parsed: catResultTemp.finalOutput
  };
} else if (myAgentResult.output_parsed.animal == "dog") {
  const dogResultTemp = await runner.run(
    dog,
    [
      ...conversationHistory
    ]
  );
  conversationHistory.push(...dogResultTemp.newItems.map((item) => item.rawItem));

  if (!dogResultTemp.finalOutput) {
      throw new Error("Agent result is undefined");
  }

  const dogResult = {
    output_text: JSON.stringify(dogResultTemp.finalOutput),
    output_parsed: dogResultTemp.finalOutput
  };
} else {
  const otherResultTemp = await runner.run(
    other,
    [
      ...conversationHistory
    ]
  );
  conversationHistory.push(...otherResultTemp.newItems.map((item) => item.rawItem));

  if (!otherResultTemp.finalOutput) {
      throw new Error("Agent result is undefined");
  }

  const otherResult = {
    output_text: otherResultTemp.finalOutput ?? ""
  };
}

}
}
```

1 Like

I got flagged as well. I set up something simple to get my emails (gmail). The user would specify how many. On the gmail mcp I specified that it did not require user approval when accessing gmail. When I changed that to ‘requires user approval’ the flag was gone. I thought in my case it was flagging the fact that a user could ask for some crazy number of emails if I left it the way it was.

But in your case, I can’t see anything like that.

Just to update. It seems that OpenAI is very worried that the chatbot could be used to offend people by “categorizing them” into animals… That is the response I got from support.

2 Likes

human support? or ai support making it up?