In [2]:
import nltk
import cPickle as pickle
from collections import Counter
from random import shuffle

Load Data

In [2]:
#test data
#hand-tagged evaluation set -- structure presumably mirrors the criteria chunks; verify against the tagging notebook
#NOTE(review): pickle.load executes arbitrary code; only load trusted files
test_data = pickle.load(open('data/test_tagged_data.pkl', 'rb'))
In [231]:
data = pickle.load(open('data/criteria_text_chunk_3.pkl', 'rb'))

Initial Concept Term Lists

In [219]:
#Seed terms for each clinical concept; matching downstream is case-insensitive
#substring search, so capitalization here is cosmetic.
smoker_list = [
    'Non-smoker', 'smoker', 'Current smoker', 'smoking',
]
pregnancy_list = [
    'Pregnancy',
]
birth_control_list = [
    'Birth control', 'contraception',
]
drug_list = [
    'Illicit drugs', 'Alcohol abuse', 'illegal', 'illicit', 'drug abuse',
]
heart_failure_list = [
    'Congestive Heart Failure', 'heart failure',
]
hiv_list = [
    'HIV', 'aids', 'human immunodeficiency virus',
]
allergy_list = [
    'Allergies', 'allergy', 'hypersensitivity',
]

Initial Predictive Terms

In [220]:
#Seed predictor phrases per concept; the empty lists get populated by the
#active-learning loop below.
smoker_pred_list = ['current']
pregnancy_pred_list = ['potential', 'negative']
birth_control_pred_list = [
    'effective',
    'Fertile patients',
    'must use effective',
    'must use',
    'use effective',
    'Fertile patients must use',
    'fertile',
]
drug_pred_list = ['use', 'abuse']
heart_failure_pred_list = []
hiv_pred_list = []
allergy_pred_list = [
    'known',
    'history',
    'suspected',
    'known suspected',
    'clinically significant',
]

Discount Dictionaries

In [221]:
#Bookkeeping for human feedback: once a candidate is rejected ("N"), it is
#stored in one of these dicts so the miners never propose it again.

#predictor phrases the user said no to, one dict per concept
smoker_pred_dict = dict()
pregnancy_pred_dict = dict()
birth_control_pred_dict = dict()
drug_pred_dict = dict()
heart_failure_pred_dict = dict()
hiv_pred_dict = dict()
allergy_pred_dict = dict()

#concept terms the user said no to, one dict per concept
smoker_term_dict = dict()
pregnancy_term_dict = dict()
birth_control_term_dict = dict()
drug_term_dict = dict()
heart_failure_term_dict = dict()
hiv_term_dict = dict()
allergy_term_dict = dict()
In [222]:
#Index-aligned groupings: position i in every one of these collections refers
#to the same concept (0=smoker, 1=pregnancy, 2=birth control, 3=drugs,
#4=heart failure, 5=HIV, 6=allergies).
pred_list = [smoker_pred_list, pregnancy_pred_list, birth_control_pred_list,
             drug_pred_list, heart_failure_pred_list, hiv_pred_list,
             allergy_pred_list]
term_list = [smoker_list, pregnancy_list, birth_control_list, drug_list,
             heart_failure_list, hiv_list, allergy_list]
pred_dicts = [smoker_pred_dict, pregnancy_pred_dict, birth_control_pred_dict,
              drug_pred_dict, heart_failure_pred_dict, hiv_pred_dict,
              allergy_pred_dict]
term_dicts = [smoker_term_dict, pregnancy_term_dict, birth_control_term_dict,
              drug_term_dict, heart_failure_term_dict, hiv_term_dict,
              allergy_term_dict]

Find new predictors

In [223]:
def active_learn_predictors(data, term_list, pred_list, pred_dicts):
    #look for more predictors for each concept by finding sentnces that have 
    #concept terms in them and looking for predictors in those sentences 

    def get_pred(text_dict, term_list, pred_dicts, pred_list):
        pred_options_dict = Counter()
        for doc in text_dict.values():
            for subdoc in doc:
                for sent in subdoc:
                    #if the sentance has less than 2 words skip it
                    if len(sent) <= 1:
                        continue
                    #crate a sentence rank for judging weight of terms found
                    sent_rank = 0
                    for term in term_list:
                        if term.lower() in ' '.join(zip(*sent)[0]).lower():
                            sent_rank += 1
                    result = chunker(sent)
                    preds = [' '.join(x) for x in [[x[0] for x in term] for term in result]]
                    preds.append(' '.join([sent[0][0], sent[1][0]]))
                    #lower case all preds
                    preds = [x.lower() for x in preds]
                    preds = preds * sent_rank
                    pred_options_dict.update(preds)

        #get top 20 predictors that have not been seen before
        sorted_preds = sorted(pred_options_dict.items(), key=lambda x: x[1], reverse=True)
        counter = 0
        top_preds = []
        for pred in sorted_preds:
            if pred[0] not in pred_list and pred[0] not in pred_dicts:
                top_preds.append(pred)
                counter += 1
                if counter == 15 or counter == len(sorted_preds):
                    return top_preds
        #if there are no preds return empty list
        return top_preds

    #get chunks for preds
    def chunker(sent):

        chunk_reg1 = r"""
                          CHUNK: {<NN.*><IN>}
                     """
        chunk_reg2 = r"""
                          CHUNK: {<VB.*><DT>}
                     """
        chunk_reg3 = r"""
                          CHUNK: {<NN.*><VB.*>}
                     """
        results = []

        for chunk_reg in [chunk_reg1, chunk_reg2, chunk_reg3]:
            cp = nltk.RegexpParser(chunk_reg)

            tree = cp.parse(sent)
            for subtree in tree.subtrees():
                if subtree.label() == 'CHUNK':
                    results.append(subtree[:])
        return results

    def human_checker(term, pred_list, top_preds, pred_dict):
        '''This function loops through the possible predictors and
        lets human input decide if they actually are or not'''
        print 'Are the following predictors of these %r?' % (term)
        if len(top_preds) > 1:
            for pred in top_preds:
                print 'Predictor: \x1b[35m %s \x1b[0m  Count: \x1b[36m %d \x1b[0m' % (pred[0], pred[1])
                answer_switch = True
                while answer_switch:
                    add_pred = raw_input('Is this a predictor of %s? (Y, N, exit): ' % (term[0]))
                    if add_pred.lower() == 'y':
                        pred_list.append(pred[0])
                        answer_switch = False
                    elif add_pred.lower() == 'exit':
                        #pass switch to exit program
                        exit_switch = True
                        return pred_list, pred_dict, exit_switch
                    elif add_pred.lower() == 'n':
                        pred_dict[pred[0]] = ''
                        answer_switch = False
                    else:
                        pass
                    
        exit_switch = False
        return pred_list, pred_dict, exit_switch


    for idx, term in enumerate(term_list):
        top_preds = get_pred(data, term, pred_dicts[idx], pred_list[idx])
        print '\n**NEW Concept**\n'
        pred_list[idx], pred_dicts[idx], exit_switch = human_checker(term, pred_list[idx], top_preds, pred_dicts[idx])
        #save list and dict
        #make sure it is not null before saving
        if pred_list[idx]:
            pickle.dump(pred_list, open('data/predictor_list.pkl', 'wb'))
            pickle.dump(pred_dicts, open('data/not_predictor_dict.pkl', 'wb'))
        else:
            print 'pred list Null'
        #if exit, exit program
        if exit_switch:
            break
    print 'Active Learning Complete'
    return pred_list, pred_dicts

Find new terms

In [224]:
def active_learn_terms(data, term_list, pred_list, term_dicts):
    #look for more terms for each concept by finding sentnces that have 
    #predictors in them and looking for terms in those sentences 

    def get_pred(text_dict, term_list, term_dicts, pred_list):
        term_options_dict = Counter()
        for doc in text_dict.values():
            for subdoc in doc:
                for sent in subdoc:
                    #skip sentence if it contains less than one word
                    if len(sent) <= 1:
                            continue
                    #crate a sentence rank for judging weight of terms found
                    sent_rank = 0
                    for pred in pred_list:
                        if pred[0].lower() in ' '.join(zip(*sent)[0]).lower():
                            sent_rank += pred[1]
                    result = chunker(sent)
                    terms = [' '.join(x) for x in [[x[0] for x in term] for term in result]]
                    terms.append(' '.join([sent[0][0], sent[1][0]]))
                    #lower case all preds
                    terms = [x.lower() for x in terms]
                    #add weights to terms by multiplying by sent_rank
                    terms = terms * sent_rank
                    term_options_dict.update(terms)

        #get top 20 predictors that have not been seen before
        sorted_terms = sorted(term_options_dict.items(), key=lambda x: x[1], reverse=True)
        counter = 0
        top_terms = []
        for term in sorted_terms:
            if term[0] not in term_list and term[0] not in term_dicts:
                top_terms.append(term)
                counter += 1
                if counter == 15 or counter == len(sorted_terms):
                    return top_terms
        #if there are no preds return empty list
        return top_terms

    #get chunks for preds
    def chunker(sent):

        chunk_reg1 = r"""
                          CHUNK: {(<NN.*><POS>)?<RB>?<JJ.*>*<NN.*>+}
                     """
        results = []

        for chunk_reg in [chunk_reg1]:
            cp = nltk.RegexpParser(chunk_reg)

            tree = cp.parse(sent)
            for subtree in tree.subtrees():
                if subtree.label() == 'CHUNK':
                    results.append(subtree[:])
        return results

    def human_checker(term_list, top_terms, term_dict):
        '''This function loops through the possible terms and
        lets human input decide if they actually are or not'''
        print 'Are the following terms part of this list: %r?' % (term_list)
        if len(top_terms) > 1:
            for term in top_terms:
                print 'Term: \x1b[35m %s \x1b[0m  Count: \x1b[36m %d \x1b[0m' % (term[0], (term[1]/7.))
                answer_switch = True
                while answer_switch:
                    add_term = raw_input('Is this similar to %s? (Y, N, exit): ' % (term_list[0]))
                    if add_term.lower() == 'y':
                        term_list.append(term[0])
                        answer_switch = False
                    elif add_term.lower() == 'exit':
                        #pass switch to exit program
                        exit_switch = True
                        return term_list, term_dict, exit_switch
                    elif add_term.lower() == 'n':
                        term_dict[term[0]] = ''
                        answer_switch = False
                    else:
                        pass
                    
        exit_switch = False
        return term_list, term_dict, exit_switch

    #making a pred weight list because of scoping problems in iPyhton notebooks
    smoker_pred_weight_list = []
    pregnancy_pred_weight_list = []
    birth_control_pred_weight_list = []
    drug_pred_weight_list = []
    heart_failure_pred_weight_list = []
    hiv_pred_weight_list = []
    allergy_pred_weight_list = []
    
    pred_weight_list = [smoker_pred_weight_list, pregnancy_pred_weight_list,
                 birth_control_pred_weight_list, drug_pred_weight_list,
                 heart_failure_pred_weight_list, hiv_pred_weight_list, allergy_pred_weight_list]
    
    #create a combined list of all preds, create Counter dict
    tot_pred_list = []
    for p in pred_list:
        tot_pred_list += p
    count_pred = Counter(tot_pred_list)

    #add weights to pred terms and create new pred weight lists
    for n in xrange(len(pred_list)):
        for idx in range(len(pred_list[n])):
            weight  = 7 - (count_pred[pred_list[n][idx]]-1)
            pred_weight_list[n].append((pred_list[n][idx], weight))




    for idx, term in enumerate(term_list):
        top_terms = get_pred(data, term, term_dicts[idx], pred_weight_list[idx])
        print '\n**NEW Concept**\n'
        term_list[idx], term_dicts[idx], exit_switch = human_checker(term, top_terms, term_dicts[idx])
        #save list and dict
        #make sure it is not null before saving
        if pred_list[idx]:
            pickle.dump(term_list, open('data/term_list.pkl', 'wb'))
            pickle.dump(term_dicts, open('data/not_term_dict.pkl', 'wb'))
        else:
            print 'Term list Null'
        #if exit, exit program
        if exit_switch:
            break
    print 'Active Learning Complete'
    return term_list, term_dicts
In [4]:
#load in past predictor terms
#(saved by active_learn_predictors after each confirmed concept)
pred_list = pickle.load(open('data/predictor_list.pkl', 'rb'))
pred_dicts = pickle.load(open('data/not_predictor_dict.pkl', 'rb'))
#load in past concept terms
#(saved by active_learn_terms after each confirmed concept)
term_list = pickle.load(open('data/term_list.pkl', 'rb'))
term_dicts = pickle.load(open('data/not_term_dict.pkl', 'rb'))

Active Learn Predictor Terms

In [ ]:
pred_list, pred_dicts = active_learn_predictors(data, term_list, pred_list, pred_dicts)

Active Learn Concept Terms

In [ ]:
term_list, term_dicts = active_learn_terms(data, term_list, pred_list, term_dicts)

Change save names for presentation examples

Display Highlighted Criteria

Display criteria split by Inclusion and Exclusion

Sentences are highlighted in different colors depending on the concept they contain

Tag the trials with two sets of tags, Inclusion and Exclusion

In [ ]:
#load trial concept lookup dict: {trial_id: {'inclusion': set of concepts,
#                                            'exclusion': set of concepts}}
trial_concept_lookup = pickle.load(open('data/trial_concept_lookup.pkl', 'rb'))
In [380]:
def criteria_highlight(data, term_list, term_color_lookup, trial_concept_lookup,
                       concept_lookup):
    for key, value in data.items():
        #print a color key
        print 'Color Legend'
        for c in xrange(len(term_color_lookup)):
            print (term_color_lookup[c] + concept_lookup[c] + '\x1b[0m \x1b[0m')
        print
        print key
        if key not in trial_concept_lookup:
            trial_concept_lookup[key] = {'inclusion':set(),
                                         'exclusion':set()}
        for group in value:
            doc = [' '.join(word) for word in [[word[0] for word in sent] for sent in group]]
            #check each sentence for concept terms
            for sent_idx in xrange(len(doc)):
                for concept_idx in xrange(len(term_list)):
                    for term in term_list[concept_idx]:
                        if term.lower() in doc[sent_idx].lower():
                            #tag trial with this concept
                            #split into inclusion and exclusion sections
                            if 'inclusion criteria' in doc[0].lower():
                                trial_concept_lookup[key]['inclusion'].add(concept_lookup[concept_idx])
                            elif 'exclusion criteria' in doc[0].lower():
                                trial_concept_lookup[key]['exclusion'].add(concept_lookup[concept_idx])
                            #if the background is being set to black you have to escape twice
                            if concept_idx == 6:
                                doc[sent_idx] = (term_color_lookup[concept_idx] + doc[sent_idx]
                                             + '\x1b[0m \x1b[0m')
                            else:
                                doc[sent_idx] = (term_color_lookup[concept_idx] + doc[sent_idx]
                                                 + '\x1b[0m')
                                
            #check to print inclusion or exclusion tags
            if 'inclusion criteria' in doc[0].lower():
                if len(trial_concept_lookup[key]['inclusion']) >= 1:
                    print 'Tags: ', list(trial_concept_lookup[key]['inclusion'])
                else:
                    print 'Tags: None'
                print

            elif 'exclusion criteria' in doc[0].lower():
                if len(trial_concept_lookup[key]['exclusion']) >= 1:
                    print 'Tags: ', list(trial_concept_lookup[key]['exclusion'])
                else:
                    print 'Tags: None'
                print

            for sent in doc:
                print sent
            print
        #save lookup dict
        pickle.dump(trial_concept_lookup, open('data/trial_concept_lookup.pkl', 'wb'))
        return trial_concept_lookup
In [400]:
#ANSI background colors, index-aligned with the concept lists; the last entry
#(black background) also sets white foreground and needs a double reset
term_color_lookup = ['\x1b[41m', '\x1b[42m', '\x1b[43m', '\x1b[44m', '\x1b[45m', '\x1b[46m',
                     '\x1b[40m \x1b[37m']
concept_lookup = ['Smoking', 'Pregnancy', 'Birth Control', 'Illicit drugs',
                  'Congestive heart failure', 'HIV', 'Allergies']

#display one randomly chosen trial that has not been tagged yet
shuffled_trials = data.items()
shuffle(shuffled_trials)

for trial in shuffled_trials:
    if trial[0] not in trial_concept_lookup:
        trial_concept_lookup = criteria_highlight({trial[0]: trial[1]}, term_list,
                                          term_color_lookup,
                                          trial_concept_lookup, concept_lookup)
        #BUGFIX: break only after a trial was actually displayed; the break
        #previously sat outside the if, so an already-tagged first trial
        #ended the loop with nothing shown
        break
Color Legend
Smoking 
Pregnancy 
Birth Control 
Illicit drugs 
Congestive heart failure 
HIV 
 Allergies 

NCT01342159
Tags: None

Inclusion Criteria :
Diabetic macular edema ( central macular thickness greater than 300 mm on optical coherence tomography )

Tags:  ['Congestive heart failure']

Exclusion Criteria :
history of glaucoma or ocular hypertension ( defined as an intraocular pressure higher than 22 mmHg )
an ocular condition ( other than diabetes ) that , in the opinion of the investigator , might affect macular oedema or alter visual acuity during the course of the study ( e . g .
retinal vein occlusion , uveitis or other ocular inflammatory disease , neovascular glaucoma , Irvine-Gass Syndrome , etc .
)
systemic corticosteroid therapy history of thromboembolic event ( including myocardial infarction or cerebral vascular accident )
major surgery within the prior 6 months or planned within the next 28 days

In [328]:
pickle.dump(term_list, open('data/term_list.pkl', 'wb'))

Notes

Notes: If a sentence contains two concepts, the first concept in the list determines the highlight color. Both concepts will still be added to the tags, however.

Example: Tags - Birth Control and Pregnancy:

Positive pregnancy test in women of child bearing potential or who are unwilling to use an acceptable method of contraception .

Problems:

Negatives - phrases like "not pregnant", or sentences stating the patient was pregnant but no longer is, are currently tagged as if they were positive mentions...

Final Concept Terms and Predictor Terms

In [392]:
term_list
Out[392]:
[['Non-smoker',
  'smoker',
  'Current smoker',
  'smoking',
  'tobacco',
  'nicotine',
  'cigarettes',
  u'tobacco products'],
 ['Pregnancy',
  u'negative pregnancy test',
  u'pregnancy',
  u'urine pregnancy test',
  u'negative serum pregnancy test',
  u'negative serum',
  u'negative urine pregnancy test',
  u'pregnant women',
  u'pregnant'],
 ['Birth control',
  'contraception',
  u'birth control',
  u'fertile patients',
  u'effective contraception',
  u'child-bearing potential',
  u'abstinence',
  u'adequate contraception',
  u'condom',
  u'iud',
  u'intrauterine device',
  u'diaphragm',
  u'oral contraceptives'],
 ['Illicit drugs',
  'Alcohol abuse',
  'illegal',
  'illicit',
  'drug abuse',
  u'alcohol',
  u'substance abuse',
  u'alcohol abuse'],
 ['Congestive Heart Failure',
  'heart failure',
  u'myocardial infarction',
  u'congestive heart failure',
  u'symptomatic congestive heart failure',
  u'cardiovascular disease',
  u'heart disease',
  u'cardiac disease'],
 ['HIV',
  'aids',
  'human immunodeficiency virus',
  u'hiv',
  u'human immunodeficiency',
  u'known hiv'],
 ['Allergies',
  'allergy',
  'hypersensitivity',
  u'known hypersensitivity',
  u'known allergy']]
In [15]:
pred_list
Out[15]:
[['current',
  u'history of',
  u'use of',
  u'tobacco use',
  u'patients who',
  u'smoking of',
  u'products in',
  u'user of',
  u'products within',
  u'smokers with',
  u'subjects who',
  u'products with',
  u'nicotine containing',
  u'smoker of',
  u'forms of'],
 ['potential',
  'negative',
  u'women of',
  u'have a',
  u'pregnancy or',
  u'females of',
  u'test at',
  u'history of',
  u'female patients',
  u'if female',
  u'females with',
  u'planning a',
  u'test for',
  u'women with',
  u'female subjects',
  u'child bearing',
  u'woman of'],
 ['effective',
  'Fertile patients',
  'must use effective',
  'must use',
  'use effective',
  'Fertile patients must use',
  'fertile',
  u'women of',
  u'method of',
  u'methods of',
  u'form of',
  u'use an',
  u'females of',
  u'patients of',
  u'use a',
  u'dose of',
  u'use of',
  u'forms of',
  u'child bearing',
  u'female patients',
  u'female subjects',
  u'fertile patients',
  u'using an',
  u'administration of'],
 ['use',
  'abuse',
  u'history of',
  u'patient is',
  u'abuse within',
  u'evidence of',
  u'abuse in',
  u'ounces of',
  u'treatment for',
  u'use of',
  u'dose of',
  u'drugs within',
  u'administration of',
  u'drug within',
  u'consumption of',
  u'intake of',
  u'drugs of',
  u'abuse of',
  u'subjects who',
  u'drugs known',
  u'presence of',
  u'drinks per',
  u'dependence on',
  u'test for',
  u'drugs with',
  u'drugs that',
  u'current or'],
 [u'history of',
  u'patients with',
  u'infarction within',
  u'evidence of',
  u'uncontrolled intercurrent',
  u'illness including',
  u'disease including',
  u'risk of',
  u'clinically significant',
  u'patient has',
  u'subjects with',
  u'symptomatic congestive',
  u'presence of',
  u'cardiovascular disease',
  u'diagnosis of',
  u'subject has',
  u'symptoms of',
  u'cardiac disease',
  u'uncontrolled congestive',
  u'has symptomatic',
  u'heart disease',
  u'severe cardiovascular'],
 [u'history of',
  u'subjects with',
  u'infection with',
  u'patients with',
  u'test for',
  u'diagnosis of',
  u'known hiv',
  u'any confirmed',
  u'patient has',
  u'known human',
  u'presence of',
  u'positive test',
  u'hiv positive',
  u'co-infection with',
  u'infection including',
  u'known infection',
  u'positive for',
  u'known diagnosis',
  u'known positive',
  u'known history',
  u'subjects who',
  u'report having'],
 ['known',
  'history',
  'suspected',
  'known suspected',
  'clinically significant',
  u'history of',
  u'patients with',
  u'known allergy',
  u'known hypersensitivity',
  u'subjects with',
  u'hypersensitivity to',
  u'allergy or',
  u'participant has',
  u'a known',
  u'allergy that',
  u'have known',
  u'intolerance of',
  u'children with',
  u'known severe',
  u'evidence of']]
In [ ]: