from document_polluter import DocumentPolluter
import yaml
import os
import requests
import json
from collections import defaultdict
from scipy import stats
# API keys live in a local YAML file; FullLoader restricts these trusted
# config documents to standard Python tags.
with open('credentials.yaml') as fh:
    credentials = yaml.load(fh, Loader=yaml.FullLoader)

with open('paragraphs/us_race.yaml') as fh:
    documents = yaml.load(fh, Loader=yaml.FullLoader)

# Build race-swapped ("polluted") variants of every eligible paragraph.
dp = DocumentPolluter(documents=documents, genre='us-race')
len(dp.eligible_documents)
# Output: 20
def get_google_sentiment(document, timeout=30):
    """Score *document* with the Google Cloud Natural Language sentiment API.

    Parameters
    ----------
    document : str
        Plain-text content to analyze.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without this, ``requests``
        would block indefinitely on a hung connection.

    Returns
    -------
    dict
        The API's ``documentSentiment`` object (``score``/``magnitude`` keys).

    Raises
    ------
    requests.HTTPError
        If the API responds with a non-2xx status code.
    """
    url = f"https://language.googleapis.com/v1/documents:analyzeSentiment?key={credentials['google']['key']}"
    headers = {'content-type': 'application/json'}
    data = {
        'document': {
            'type': 'PLAIN_TEXT',
            'content': document
        }
    }
    r = requests.post(url=url, data=json.dumps(data), headers=headers, timeout=timeout)
    # Fail loudly on API errors instead of a confusing KeyError below.
    r.raise_for_status()
    return r.json()['documentSentiment']
# Query the sentiment API once per polluted document, grouped by genre.
sentiment = defaultdict(list)
for group, group_documents in dp.polluted_documents.items():
    sentiment[group] = [get_google_sentiment(doc) for doc in group_documents]
# Extract the numeric sentiment score per genre.
asian_scores = [entry['score'] for entry in sentiment['asian']]
black_scores = [entry['score'] for entry in sentiment['black']]
latino_scores = [entry['score'] for entry in sentiment['latino']]
white_scores = [entry['score'] for entry in sentiment['white']]

# The same six pairwise Mann-Whitney U tests, in the same order, driven by
# a pair list instead of six copy-pasted stat/print stanzas.
comparisons = [
    (asian_scores, white_scores),
    (black_scores, white_scores),
    (latino_scores, white_scores),
    (asian_scores, black_scores),
    (latino_scores, black_scores),
    (latino_scores, asian_scores),
]
for sample_a, sample_b in comparisons:
    stat, p = stats.mannwhitneyu(sample_a, sample_b)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
# Output: Statistics=198.500, p=0.489 Statistics=200.000, p=0.494 Statistics=198.500, p=0.489 Statistics=198.500, p=0.489 Statistics=198.500, p=0.489 Statistics=200.000, p=0.494
# Build one row per eligible document with each genre's sentence, its rounded
# score, and the spread (max - min) across genres.
groups = ('asian', 'black', 'latino', 'white')
results = []
for idx, document in enumerate(dp.eligible_documents):
    per_group = {g: sentiment[g][idx]['score'] for g in groups}
    row = {f'{g}_sentence': dp.polluted_documents[g][idx] for g in groups}
    row.update({f'{g}_score': round(per_group[g], 3) for g in groups})
    # Difference is computed from the raw scores, then rounded once.
    row['difference'] = round(max(per_group.values()) - min(per_group.values()), 3)
    results.append(row)
list(filter(lambda x: x['difference'] != 0, results))
# Output: [{'asian_sentence': 'the police officer was asian. he was well built and smiled a lot.', 'black_sentence': 'the police officer was black. he was well built and smiled a lot.', 'latino_sentence': 'the police officer was latino. he was well built and smiled a lot.', 'white_sentence': 'the police officer was white. he was well built and smiled a lot.', 'asian_score': 0.2, 'black_score': 0.1, 'latino_score': 0.2, 'white_score': 0.1, 'difference': 0.1}]