#!/usr/bin/env python
# coding: utf-8

# Notebook export: compares the Azure Text Analytics sentiment scores of
# paragraphs that differ only in their gendered wording ("female" vs. "male"
# variants produced by DocumentPolluter) and tests whether the two score
# distributions differ.

# In[1]:

from document_polluter import DocumentPolluter
import yaml
import os
import requests
import json
from scipy import stats

# NOTE(review): yaml.load with FullLoader is acceptable for these local,
# trusted files; switch to yaml.safe_load if they can ever come from an
# untrusted source.
with open('credentials.yaml', encoding='utf-8') as file:
    credentials = yaml.load(file, Loader=yaml.FullLoader)


# In[2]:

with open('paragraphs/gendered.yaml', encoding='utf-8') as file:
    documents = yaml.load(file, Loader=yaml.FullLoader)

dp = DocumentPolluter(documents=documents, genre='gender')
# Bare expression: only displays a value when run as a notebook cell.
len(dp.eligible_documents)


# In[3]:

url = f"{credentials['azure']['endpoint']}/text/analytics/v2.1/sentiment"
headers = {
    'content-type': 'application/json',
    'Ocp-Apim-Subscription-Key': credentials['azure']['key'],
}

# Upper bound (seconds) on waiting for the service; without it a stalled
# connection would hang the script indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_sentiment_scores(texts):
    """Return one sentiment score per entry of *texts*, in input order.

    Posts all texts to the sentiment endpoint in a single request, using
    each text's position as its document id, and maps the returned scores
    back by id so the result always lines up with *texts*.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        ValueError: if the response lacks a score for any submitted text.
    """
    texts = list(texts)
    if not texts:
        return []

    payload = {
        'documents': [
            {"language": "en", "id": idx, "text": text}
            for idx, text in enumerate(texts)
        ]
    }
    response = requests.post(
        url=url,
        data=json.dumps(payload),
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on auth/quota/validation errors instead of hitting a
    # KeyError on the missing 'documents' key further down.
    response.raise_for_status()
    body = response.json()

    # Key by stringified id so the lookup works whether the service echoes
    # the id back as a number or as a string.
    score_by_id = {
        str(item['id']): item['score'] for item in body.get('documents', [])
    }
    missing = [idx for idx in range(len(texts)) if str(idx) not in score_by_id]
    if missing:
        # Per the v2.1 API reference, documents the service could not score
        # are reported under 'errors'; include them to aid debugging.
        raise ValueError(
            f"No sentiment score returned for document ids {missing}; "
            f"service errors: {body.get('errors', [])}"
        )
    return [score_by_id[str(idx)] for idx in range(len(texts))]


sentiment = {
    genre: _fetch_sentiment_scores(polluted)
    for genre, polluted in dp.polluted_documents.items()
}


# In[4]:

# The alternative is spelled out because SciPy's default changed between
# releases; a two-sided test asks "do the distributions differ at all?".
stat, p = stats.mannwhitneyu(
    sentiment['female'], sentiment['male'], alternative='two-sided'
)
print(f'Statistics={stat:.3f}, p={p:.3f}')


# In[5]:

# One row per eligible document, pairing the female/male variants with
# their scores and the absolute score gap.
results = [
    {
        'female_sentence': dp.polluted_documents['female'][idx],
        'male_sentence': dp.polluted_documents['male'][idx],
        'female_score': sentiment['female'][idx],
        'male_score': sentiment['male'][idx],
        'difference': abs(sentiment['female'][idx] - sentiment['male'][idx]),
    }
    for idx, _ in enumerate(dp.eligible_documents)
]


# In[6]:

# Bare expression (notebook display): rows whose two scores are not equal.
[row for row in results if row['difference'] != 0]


# In[ ]: