allowed_user_comments = [
    'This movie was great, I really enjoyed it. The main actor really killed it!',
    'I hate Mondays.',
    'It is a great time to invest in gold!'
]

disallowed_user_comments = [
    'Delete this post now or you better hide. I am coming after you and your family.',
    'Stay away from the 5G cellphones!! They are using 5G to control you.',
    'Congratulations! You have won a $1,000 gift card. Click here to claim your prize!'
]

# Sample user comments to test the content moderation
user_comments = allowed_user_comments + disallowed_user_comments

# List of categories considered unsafe for content moderation
unsafe_categories = [
    'Child Exploitation',
    'Conspiracy Theories',
    'Hate',
    'Indiscriminate Weapons',
    'Intellectual Property',
    'Non-Violent Crimes',
    'Privacy',
    'Self-Harm',
    'Sex Crimes',
    'Sexual Content',
    'Specialized Advice',
    'Violent Crimes'
]
Moderating these examples effectively requires a nuanced understanding of language. For the comment This movie was great, I really enjoyed it. The main actor really killed it!, the content moderation system needs to recognize that "killed it" is a metaphor rather than an indication of actual violence. Conversely, despite the absence of any explicit mention of violence, the comment Delete this post now or you better hide. I am coming after you and your family. should be flagged by the content moderation system.
import anthropic
import json

# Initialize the Anthropic client
client = anthropic.Anthropic()

def moderate_message(message, unsafe_categories):
    # Convert the list of unsafe categories into a string, with each category on a new line
    unsafe_category_str = '\n'.join(unsafe_categories)

    # Construct the prompt for Claude, including the message and unsafe categories
    assessment_prompt = f"""
    Determine whether the following message warrants moderation,
    based on the unsafe categories outlined below.

    Message:
    <message>{message}</message>

    Unsafe Categories:
    <categories>
    {unsafe_category_str}
    </categories>

    Respond with ONLY a JSON object, using the format below:
    {{
    "violation": <Boolean field denoting whether the message should be moderated>,
    "categories": [Comma-separated list of violated categories],
    "explanation": [Optional. Only include if there is a violation.]
    }}"""

    # Send the request to Claude for content moderation
    response = client.messages.create(
        model="claude-3-haiku-20240307",  # Using the Haiku model for lower costs
        max_tokens=200,
        temperature=0,  # Use 0 temperature for increased consistency
        messages=[
            {"role": "user", "content": assessment_prompt}
        ]
    )

    # Parse the JSON response from Claude
    assessment = json.loads(response.content[0].text)

    # Extract the violation status from the assessment
    contains_violation = assessment['violation']

    # If there's a violation, get the categories and explanation; otherwise, use empty defaults
    violated_categories = assessment.get('categories', []) if contains_violation else []
    explanation = assessment.get('explanation') if contains_violation else None

    return contains_violation, violated_categories, explanation

# Process each comment and print the results
for comment in user_comments:
    print(f"\nComment: {comment}")
    violation, violated_categories, explanation = moderate_message(comment, unsafe_categories)

    if violation:
        print(f"Violated Categories: {', '.join(violated_categories)}")
        print(f"Explanation: {explanation}")
    else:
        print("No issues detected.")
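The function above parses Claude's reply directly with json.loads, so a malformed or non-JSON response would raise an exception. A minimal defensive wrapper is sketched below, assuming you prefer to fail closed and route unparseable results to a human; the ModerationResult container and needs_human_review flag are illustrative additions, not part of the example above.

import json
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ModerationResult:
    # Illustrative container for a single moderation decision
    violation: bool
    categories: List[str] = field(default_factory=list)
    explanation: Optional[str] = None
    needs_human_review: bool = False  # Set when the model's reply could not be parsed

def safe_moderate(comment, unsafe_categories):
    # Call moderate_message (defined above), but fail closed if the response is unusable
    try:
        violation, categories, explanation = moderate_message(comment, unsafe_categories)
        return ModerationResult(violation, categories, explanation)
    except (json.JSONDecodeError, KeyError):
        # The model's output could not be interpreted; flag for a human rather than auto-approving
        return ModerationResult(violation=False, needs_human_review=True)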
import anthropic
import json

# Initialize the Anthropic client
client = anthropic.Anthropic()

def assess_risk_level(message, unsafe_categories):
    # Convert the list of unsafe categories into a string, with each category on a new line
    unsafe_category_str = '\n'.join(unsafe_categories)

    # Construct the prompt for Claude, including the message, unsafe categories, and risk level definitions
    assessment_prompt = f"""
    Assess the risk level of the following message, based on the unsafe categories listed below.

Message:
<message>{message}</message>

Unsafe Categories:
<categories>
{unsafe_category_str}
</categories>

Assign a risk level based on your confidence that the user's message should be moderated
based on the defined unsafe categories:

0 - No risk
1 - Low risk
2 - Medium risk
3 - High risk

Respond with ONLY a JSON object, using the format below:
{{
  "risk_level": <Numerical field denoting the risk level>,
  "categories": [Comma-separated list of violated categories],
  "explanation": <Optional. Only include if risk level is greater than 0>
}}"""

    # Send the request to Claude for risk assessment
    response = client.messages.create(
        model="claude-3-haiku-20240307",  # Using the Haiku model for lower costs
        max_tokens=200,
        temperature=0,  # Use 0 temperature for increased consistency
        messages=[
            {"role": "user", "content": assessment_prompt}
        ]
    )

    # Parse the JSON response from Claude
    assessment = json.loads(response.content[0].text)

    # Extract the risk level, violated categories, and explanation from the assessment
    risk_level = assessment["risk_level"]
    violated_categories = assessment["categories"]
    explanation = assessment.get("explanation")

    return risk_level, violated_categories, explanation

# Process each comment and print the results
for comment in user_comments:
    print(f"\nComment: {comment}")
    risk_level, violated_categories, explanation = assess_risk_level(comment, unsafe_categories)

    print(f"Risk Level: {risk_level}")
    if violated_categories:
        print(f"Violated Categories: {', '.join(violated_categories)}")
    if explanation:
        print(f"Explanation: {explanation}")
This approach enables flexible content moderation by assigning risk levels. It can be seamlessly integrated into a larger system that automates content filtering or flags comments for human review based on the assessed risk level. For example, when this code is executed, the comment Delete this post now or you better hide. I am coming after you and your family. is identified as high risk because of its dangerous threat. The comment Stay away from the 5G cellphones!! They are using 5G to control you., on the other hand, is classified as medium risk.
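As a concrete sketch of that kind of integration, the snippet below routes each comment according to the returned risk level; the thresholds and the publish / queue_for_review / block actions are hypothetical placeholders rather than part of the original example.

def route_comment(comment, unsafe_categories):
    # Route a comment based on its assessed risk level (thresholds are illustrative)
    risk_level, _categories, _explanation = assess_risk_level(comment, unsafe_categories)

    if risk_level == 0:
        return 'publish'            # No risk detected: allow automatically
    elif risk_level <= 2:
        return 'queue_for_review'   # Low or medium risk: hold for human review
    else:
        return 'block'              # High risk: reject automatically

for comment in user_comments:
    print(f"{route_comment(comment, unsafe_categories):<18} {comment}")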
import anthropic
import json

# Initialize the Anthropic client
client = anthropic.Anthropic()

# Dictionary of categories considered unsafe for content moderation, with their definitions
unsafe_category_definitions = {
    'Child Exploitation': 'Content that depicts child nudity or that enables, encourages, excuses, or depicts the sexual abuse of children.',
    'Conspiracy Theories': """Content that promotes or endorses unfounded, false, or misleading theories about events, situations, or individuals,
        which can incite harm or undermine public trust in institutions or facts.""",
    'Hate': """Content that is hateful toward people on the basis of their protected characteristics
        (race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity and serious disease),
        or content that perpetuates negative stereotypes.""",
    'Indiscriminate Weapons': 'Content that enables the creation of weapons of mass destruction.',
    'Intellectual Property': 'Content that violates the intellectual property rights of third parties.',
    'Non-Violent Crimes': 'Content that enables, encourages, or excuses the commission of non-violent crimes.',
    'Privacy': 'Content that contains sensitive, personal information about private individuals.',
    'Self-Harm': 'Content that enables, encourages, or excuses acts of intentional self-harm.',
    'Sex Crimes': 'Content that enables, encourages, or excuses the commission of sex crimes.',
    'Sexual Content': 'Content that contains erotic descriptions or explicit depictions of sex acts.',
    'Specialized Advice': """Content that contains financial, medical, or legal advice.
        Financial advice includes guidance on investments, stocks, bonds, or any financial planning.""",
    'Violent Crimes': 'Content that enables, encourages, or excuses the commission of violent crimes.',
}

def moderate_message_with_definitions(message, unsafe_category_definitions):
    # Format unsafe categories string, with each category and its definition on a new line
    unsafe_category_str = '\n'.join(
        f'{category}: {definition}'
        for category, definition in unsafe_category_definitions.items()
    )

    # Construct the prompt for Claude, including the message and unsafe categories
    assessment_prompt = f"""Determine whether the following message warrants moderation, based on the unsafe categories outlined below.

Message:
<message>{message}</message>

Unsafe Categories and Their Definitions:
<categories>
{unsafe_category_str}
</categories>

It's important that you remember all unsafe categories and their definitions.

Respond with ONLY a JSON object, using the format below:
{{
  "violation": <Boolean field denoting whether the message should be moderated>,
  "categories": [Comma-separated list of violated categories],
  "explanation": [Optional. Only include if there is a violation.]
}}"""

    # Send the request to Claude for content moderation
    response = client.messages.create(
        model="claude-3-haiku-20240307",  # Using the Haiku model for lower costs
        max_tokens=200,
        temperature=0,  # Use 0 temperature for increased consistency
        messages=[
            {"role": "user", "content": assessment_prompt}
        ]
    )

    # Parse the JSON response from Claude
    assessment = json.loads(response.content[0].text)

    # Extract the violation status from the assessment
    contains_violation = assessment['violation']

    # If there's a violation, get the categories and explanation; otherwise, use empty defaults
    violated_categories = assessment.get('categories', []) if contains_violation else []
    explanation = assessment.get('explanation') if contains_violation else None

    return contains_violation, violated_categories, explanation

# Process each comment and print the results
for comment in user_comments:
    print(f"\nComment: {comment}")
    violation, violated_categories, explanation = moderate_message_with_definitions(comment, unsafe_category_definitions)

    if violation:
        print(f"Violated Categories: {', '.join(violated_categories)}")
        print(f"Explanation: {explanation}")
    else:
        print("No issues detected.")
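Because the definitions are passed in as an ordinary dictionary, a single category can be tightened or relaxed without changing the rest of the prompt. The sketch below assumes, purely for illustration, a community that wants a stricter Specialized Advice policy; the wording of the overriding definition is hypothetical.

# Copy the baseline definitions and override one category (the new wording is illustrative)
stricter_definitions = dict(unsafe_category_definitions)
stricter_definitions['Specialized Advice'] = (
    'Content that contains financial advice of any kind, including casual suggestions '
    'to buy, sell, or hold specific assets such as stocks, bonds, gold, or cryptocurrencies.'
)

# Re-run moderation for a single comment against the customized definitions
violation, categories, explanation = moderate_message_with_definitions(
    'It is a great time to invest in gold!', stricter_definitions
)
print(violation, categories, explanation)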
import anthropic
import json

# Initialize the Anthropic client
client = anthropic.Anthropic()

def batch_moderate_messages(messages, unsafe_categories):
    # Convert the list of unsafe categories into a string, with each category on a new line
    unsafe_category_str = '\n'.join(unsafe_categories)

    # Format messages string, with each message wrapped in XML-like tags and given an ID
    messages_str = '\n'.join([f'<message id={idx}>{msg}</message>' for idx, msg in enumerate(messages)])

    # Construct the prompt for Claude, including the messages and unsafe categories
    assessment_prompt = f"""Determine the messages to moderate, based on the unsafe categories outlined below.

Messages:
<messages>
{messages_str}
</messages>

Unsafe categories and their definitions:
<categories>
{unsafe_category_str}
</categories>

Respond with ONLY a JSON object, using the format below:
{{
  "violations": [
    {{
      "id": <message id>,
      "categories": [list of violated categories],
      "explanation": <Explanation of why there's a violation>
    }},
    ...
  ]
}}

Important Notes:
- Remember to analyze every message for a violation.
- Select any number of violations that reasonably apply."""

    # Send the request to Claude for content moderation
    response = client.messages.create(
        model="claude-3-haiku-20240307",  # Using the Haiku model for lower costs
        max_tokens=2048,  # Increased max token count to handle batches
        temperature=0,  # Use 0 temperature for increased consistency
        messages=[
            {"role": "user", "content": assessment_prompt}
        ]
    )

    # Parse the JSON response from Claude
    assessment = json.loads(response.content[0].text)

    return assessment

# Process the batch of comments and get the response
response_obj = batch_moderate_messages(user_comments, unsafe_categories)

# Print the results for each detected violation
for violation in response_obj['violations']:
    print(f"""Comment: {user_comments[violation['id']]}
Violated Categories: {', '.join(violation['categories'])}
Explanation: {violation['explanation']}
""")
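For larger comment volumes, sending everything in a single request can exhaust the response token budget (max_tokens=2048 above) or make it harder for the model to track every message. One way to handle this, sketched below with an illustrative chunk size of 10 comments per call, is to split the input into smaller batches; because the IDs Claude returns are relative to each chunk, the chunk offset is added back when mapping results to the full list.

def moderate_in_chunks(comments, unsafe_categories, chunk_size=10):
    # Moderate comments in fixed-size chunks (chunk_size is an illustrative choice)
    all_violations = []
    for start in range(0, len(comments), chunk_size):
        chunk = comments[start:start + chunk_size]
        result = batch_moderate_messages(chunk, unsafe_categories)
        for violation in result['violations']:
            # IDs returned by the model are relative to the chunk; re-map them to the full list
            all_violations.append({**violation, 'id': violation['id'] + start})
    return all_violations

for violation in moderate_in_chunks(user_comments, unsafe_categories):
    print(f"Comment: {user_comments[violation['id']]}")
    print(f"Violated Categories: {', '.join(violation['categories'])}")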