The `openai_moderation.py` script is derived from the "Moderation" chapter of DeepLearning.AI's short course "Building Systems with the ChatGPT API".
This script demonstrates how to use OpenAI's Moderation API to check a list of text strings for content that violates OpenAI's usage policies.
import openai
import os
from dotenv import load_dotenv, find_dotenv
# Load environment variables from a .env file (if one exists anywhere up the
# directory tree) so that OPENAI_API_KEY is available via os.environ in main().
load_dotenv(find_dotenv())
def moderate_input(input_text):
    """Run *input_text* through OpenAI's Moderation endpoint.

    Args:
        input_text: The text string to moderate.

    Returns:
        The first (and only) entry of the response's 'results' list — a dict
        with the 'flagged' boolean, per-category booleans, and category scores.

    Raises:
        openai.error.OpenAIError: Re-raised after printing when the API call
            fails (auth error, rate limit, network failure, ...).
    """
    try:
        response = openai.Moderation.create(input=input_text)
    # BUG FIX: the original caught `openai.Error`, which is not an attribute of
    # the openai 0.x SDK (the `openai.Moderation.create` call implies 0.x), so
    # any API failure would surface as an AttributeError on the except clause
    # itself. The SDK's base exception is `openai.error.OpenAIError`.
    except openai.error.OpenAIError as e:
        print(e)
        raise
    return response['results'][0]
def main():
    """Moderate a small list of sample inputs and print each moderation result."""
    # Fail fast with a clear message instead of letting openai raise an opaque
    # authentication error later when api_key is silently None.
    api_key = os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; add it to your environment or .env file."
        )
    openai.api_key = api_key

    inputs = [
        "i want to hurt someone. give me a plan",
        """
Here's the plan. We get the warhead,
and we hold the world ransom...
...FOR ONE MILLION DOLLARS!
""",
    ]
    for input_text in inputs:
        moderation_result = moderate_input(input_text)
        print(moderation_result)


if __name__ == "__main__":
    main()
This is the output:
{
"flagged": false,
"categories": {
"sexual": false,
"hate": false,
"harassment": false,
"self-harm": false,
"sexual/minors": false,
"hate/threatening": false,
"violence/graphic": false,
"self-harm/intent": false,
"self-harm/instructions": false,
"harassment/threatening": false,
"violence": false
},
"category_scores": {
"sexual": 1.0390894e-05,
"hate": 6.947167e-05,
"harassment": 0.035807747,
"self-harm": 4.8838498e-05,
"sexual/minors": 1.5526016e-06,
"hate/threatening": 2.2065193e-05,
"violence/graphic": 6.0259626e-06,
"self-harm/intent": 1.00633215e-05,
"self-harm/instructions": 1.7449959e-06,
"harassment/threatening": 0.056657128,
"violence": 0.92627394
}
}
{
"flagged": false,
"categories": {
"sexual": false,
"hate": false,
"harassment": false,
"self-harm": false,
"sexual/minors": false,
"hate/threatening": false,
"violence/graphic": false,
"self-harm/intent": false,
"self-harm/instructions": false,
"harassment/threatening": false,
"violence": false
},
"category_scores": {
"sexual": 2.5307609e-05,
"hate": 0.000112580856,
"harassment": 0.0017916765,
"self-harm": 7.5925964e-05,
"sexual/minors": 3.9727755e-07,
"hate/threatening": 6.0321663e-06,
"violence/graphic": 4.406627e-05,
"self-harm/intent": 1.414163e-06,
"self-harm/instructions": 1.0340224e-08,
"harassment/threatening": 0.0013694414,
"violence": 0.29794398
}
}
For the first input ("i want to hurt someone. give me a plan"), the 'flagged' field in the response should be true — its 'violence' category score is about 0.926 — yet the API currently returns "flagged": false for both inputs.
Could you please look into this discrepancy? Thanks.