import os
import glob
import re
import io
from PIL import Image
from panoptes_client import Panoptes, Project, SubjectSet, Subject
from panoptes_client.exportable import Exportable
from credentials import ZOONIVERSE_USER, ZOONIVERSE_PW
import pandas as pd
from pandas import json_normalize
import json
import arrow
Panoptes.connect(username=ZOONIVERSE_USER, password=ZOONIVERSE_PW)
<panoptes_client.panoptes.Panoptes at 0x7f62e06437f0>
Should add a step here to generate a fresh export before downloading it.
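Something like this might work (an untested sketch), using the generate and wait options that get_export inherits from the Exportable mixin imported above, in place of the plain get_export call below:
# Sketch: ask Zooniverse to build a fresh export, then wait until it's ready.
# generate=True requests a new export; wait=True polls until it's available.
classification_export = Project(8678).get_export('classifications', generate=True, wait=True)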
classification_export = Project(8678).get_export('classifications')
df = pd.read_csv(io.BytesIO(classification_export.content))
df.head()
  | classification_id | user_name | user_id | user_ip | workflow_id | workflow_name | workflow_version | created_at | gold_standard | expert | metadata | annotations | subject_data | subject_ids
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 152228384 | wragge | 299993 | f56860099175406887f7 | 9738 | Transcribe | 5.60 | 2019-03-13 06:24:43 UTC | NaN | NaN | {"source":"api","session":"e0d698612e971be2c63... | [{"task":"T0","value":"Potters Sulphide Ore Tr... | {"31216279":{"retired":null,"row":"32","page":... | 31216279 |
1 | 152456334 | wragge | 299993 | 0d91b037230b486aa157 | 9738 | Transcribe | 10.80 | 2019-03-14 01:20:18 UTC | NaN | NaN | {"source":"api","session":"4f36768a74d53502e38... | [{"task":"T0","value":"","task_label":"What's ... | {"31216275":{"retired":null,"row":"29","page":... | 31216275 |
2 | 174876307 | wragge | 299993 | 29287f669dd4ade1c4fb | 11577 | Transcribe dates | 23.25 | 2019-07-17 07:19:39 UTC | NaN | NaN | {"source":"api","session":"f9ba2ebc36b589350fc... | [{"task":"T0","task_label":"Session?","value":... | {"35123970":{"retired":null,"year":1929,"image... | 35123970 |
3 | 174876535 | wragge | 299993 | 29287f669dd4ade1c4fb | 11577 | Transcribe dates | 27.27 | 2019-07-17 07:22:44 UTC | NaN | NaN | {"source":"api","session":"23df4ddbc2756f8ab3d... | [{"task":"T4","task_label":"Can you see a date... | {"35124669":{"retired":null,"year":1929,"image... | 35124669 |
4 | 174876842 | wragge | 299993 | 29287f669dd4ade1c4fb | 11577 | Transcribe dates | 45.45 | 2019-07-17 07:26:17 UTC | NaN | NaN | {"source":"api","session":"54ade2eb34a577ebae4... | [{"task":"T4","task_label":"Can you see a date... | {"35125039":{"retired":null,"year":1929,"image... | 35125039 |
The annotations and subject_data columns contain nested JSON strings, so they need to be flattened before analysis.
json.loads(df.loc[df['workflow_id'] == 11577].iloc[10]['annotations'])
[{'task': 'T4', 'task_label': 'Can you see a handwritten date?', 'value': 'Yes'}, {'task': 'T0', 'task_label': 'Can you identify the session? Look for one of the following handwritten headings.', 'value': 'No session heading'}, {'task': 'T5', 'task_label': 'What is the day of the week?', 'value': 'Saturday'}, {'task': 'T1', 'value': '29', 'task_label': 'What is the day of the month? This should be a number between 1 and 31.'}, {'task': 'T2', 'task_label': 'What is the month?', 'value': 'June'}, {'task': 'T3', 'value': '1929', 'task_label': 'What is the year?'}]
def list_of_dicts(ld):
    '''
    Convert a JSON string containing a list of annotation dicts
    into a single dict mapping each task id to its value.
    '''
    return {d['task']: d['value'] for d in json.loads(ld)}
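For example:
list_of_dicts('[{"task": "T0", "value": "Morning"}, {"task": "T3", "value": "1929"}]')
# -> {'T0': 'Morning', 'T3': '1929'}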
# Flatten the annotations into one column per task
annotations = json_normalize(df['annotations'].apply(list_of_dicts).to_list())
# Each subject_data cell is a dict keyed by subject id -- grab its single value
subjects = json_normalize(df['subject_data'].apply(lambda x: list(json.loads(x).values())[0]).to_list())
annotations.head()
  | T0 | T1 | T2 | T3 | T4 | T5
---|---|---|---|---|---|---|
0 | Potters Sulphide Ore Treatm't | NaN | NaN | NaN | NaN | NaN |
1 |  | NaN | NaN | NaN | NaN | NaN |
2 | Morning | 12 | NaN | NaN | NaN | NaN |
3 | Morning | 14 | January | 1929 | Yes | NaN |
4 | Noon | 3 | May | 1929 | Yes | NaN |
subjects.head()
  | column | image | page | page_number | retired | row | volume | volume_number | year
---|---|---|---|---|---|---|---|---|---|
0 | 2 | N193-022_0184-col-2-32.jpg | 0184 | NaN | None | 32 | N193-022 | NaN | NaN |
1 | 2 | N193-022_0184-col-2-29.jpg | 0184 | NaN | None | 29 | N193-022 | NaN | NaN |
2 | NaN | N193-115_0158-header.jpg | NaN | 158.0 | None | NaN | N193-115 | 115.0 | 1929.0 |
3 | NaN | N193-113_0028-header.jpg | NaN | 28.0 | None | NaN | N193-113 | 113.0 | 1929.0 |
4 | NaN | N193-114_0110-header.jpg | NaN | 110.0 | None | NaN | N193-114 | 114.0 | 1929.0 |
df_expanded = df[['user_name', 'user_id', 'workflow_id', 'workflow_name', 'created_at', 'gold_standard', 'expert', 'subject_ids']].join([annotations, subjects])
df_headers = df_expanded.loc[df_expanded['workflow_id'] == 11577]
df_headers.head()
  | user_name | user_id | workflow_id | workflow_name | created_at | gold_standard | expert | subject_ids | T0 | T1 | ... | T5 | column | image | page | page_number | retired | row | volume | volume_number | year
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | wragge | 299993 | 11577 | Transcribe dates | 2019-07-17 07:19:39 UTC | NaN | NaN | 35123970 | Morning | 12 | ... | NaN | NaN | N193-115_0158-header.jpg | NaN | 158.0 | None | NaN | N193-115 | 115.0 | 1929.0 |
3 | wragge | 299993 | 11577 | Transcribe dates | 2019-07-17 07:22:44 UTC | NaN | NaN | 35124669 | Morning | 14 | ... | NaN | NaN | N193-113_0028-header.jpg | NaN | 28.0 | None | NaN | N193-113 | 113.0 | 1929.0 |
4 | wragge | 299993 | 11577 | Transcribe dates | 2019-07-17 07:26:17 UTC | NaN | NaN | 35125039 | Noon | 3 | ... | NaN | NaN | N193-114_0110-header.jpg | NaN | 110.0 | None | NaN | N193-114 | 114.0 | 1929.0 |
5 | wragge | 299993 | 11577 | Transcribe dates | 2019-07-22 23:46:08 UTC | NaN | NaN | 35124594 | Afternoon | 8 | ... | NaN | NaN | N193-116_0149-header.jpg | NaN | 149.0 | None | NaN | N193-116 | 116.0 | 1929.0 |
6 | wragge | 299993 | 11577 | Transcribe dates | 2019-07-22 23:58:35 UTC | NaN | NaN | 35123979 | Noon | 4 | ... | Monday | NaN | N193-116_0127-header.jpg | NaN | 127.0 | None | NaN | N193-116 | 116.0 | 1929.0 |
5 rows × 23 columns
df_headers['T0'].value_counts()
Afternoon             299
Noon                  282
Morning               135
No session heading     72
Name: T0, dtype: int64
df_headers['T1'].value_counts()
17                     31
10                     30
11                     30
12                     30
18                     28
4                      26
9                      23
1                      23
7                      23
20                     22
5                      22
21                     22
26                     22
29                     21
14                     20
8                      20
28                     20
15                     20
6                      19
30                     19
27                     18
25                     18
22                     18
23                     17
16                     16
3                      16
2                      15
19                     15
13                     14
31                     13
                       ..
26th                    9
2nd                     7
11th                    7
13th                    6
12th                    6
28th                    6
30th                    5
5th                     5
7th                     5
16th                    5
[unclear][/unclear]     5
10th                    5
21st                    5
23rd                    5
18th                    4
22nd                    4
6th                     4
27th                    3
14th                    3
9th                     3
20th                    3
8th                     3
24th                    2
25th                    2
19th                    2
29th                    2
3rd                     2
17th                    2
1st                     1
                        1
Name: T1, Length: 63, dtype: int64
df_headers['T2'].value_counts()
May          90
July         74
September    72
August       71
November     68
June         65
April        62
March        61
October      61
January      61
December     51
February     49
Name: T2, dtype: int64
df_headers['T3'].value_counts()
1929                   403
1930                   370
1989                     3
1919                     2
17                       1
[unclear][/unclear]      1
1030                     1
12                       1
1921                     1
1979                     1
1920                     1
Name: T3, dtype: int64
df_headers['T4'].value_counts()
Yes    787
No       1
Name: T4, dtype: int64
df_headers['T5'].value_counts()
Tuesday                 150
Friday                  149
Wednesday               143
Thursday                142
Monday                  127
Saturday                 71
Not given or unclear      1
Name: T5, dtype: int64
df_headers.shape
(797, 23)
# Filter columns
df_headers = df_headers[['user_name', 'created_at', 'subject_ids', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'image', 'year']]
Remove ordinals from transcribed day values.
# Using .loc here to avoid SettingWithCopy warnings
# Note expand=False returns a Series rather than a DataFrame
df_headers.loc[:, 'T1n'] = df_headers.loc[:, 'T1'].str.extract(r'(^\d+)', expand=False)
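The pattern keeps only leading digits, so '21st' becomes '21', while values like '[unclear][/unclear]' become NaN:
pd.Series(['21st', '3', '[unclear][/unclear]']).str.extract(r'(^\d+)', expand=False)
# 0     21
# 1      3
# 2    NaN
# dtype: object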
df_headers['T1n'].value_counts()
11    37
4     36
12    36
10    35
17    33
18    32
26    31
15    30
7     28
21    27
5     27
9     26
28    26
20    25
30    24
1     24
29    23
14    23
8     23
6     23
2     22
23    22
22    22
16    21
27    21
13    20
25    20
3     18
19    17
24    15
31    13
Name: T1n, dtype: int64
import datetime
import calendar
# T1n days
# T2 months
# T3 years
def create_date(row):
    try:
        # Convert the month name into an index value between 1 and 12
        # (.index() raises ValueError for unrecognised month names)
        month = list(calendar.month_name).index(row['T2'])
        # Assemble a datetime from the parts
        date = datetime.datetime(int(row['T3']), month, int(row['T1n']))
    except ValueError:
        date = None
    return date
df_headers.loc[:, 'date'] = df_headers.dropna(subset=['T1n', 'T2', 'T3']).apply(create_date, axis=1)
# Convert the column to a date type
# errors='coerce' will turn non-date-like values into NaT
df_headers.loc[:, 'date'] = pd.to_datetime(df_headers.loc[:, 'date'], errors='coerce')
df_headers.loc[df_headers['date'].isnull()].style.applymap(lambda x: 'background-color: yellow', subset=pd.IndexSlice[:, ['T1', 'T2', 'T3']])
  | user_name | created_at | subject_ids | T0 | T1 | T2 | T3 | T4 | T5 | image | year | T1n | date
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | wragge | 2019-07-17 07:19:39 UTC | 35123970 | Morning | 12 | nan | nan | nan | nan | N193-115_0158-header.jpg | 1929 | 12 | NaT |
142 | Moegami | 2019-08-16 00:59:37 UTC | 35381824 | nan | nan | nan | nan | No | nan | N193-120_0142-header.jpg | 1930 | nan | NaT |
228 | Walker.C | 2019-08-16 02:05:45 UTC | 35380988 | nan | nan | nan | nan | None | nan | N193-119_0338-header.jpg | 1930 | nan | NaT |
241 | Walker.C | 2019-08-16 02:09:23 UTC | 35124292 | Noon | [unclear][/unclear] | October | 1929 | Yes | Tuesday | N193-116_0078-header.jpg | 1929 | nan | NaT |
263 | Walker.C | 2019-08-16 02:21:38 UTC | 35382123 | Afternoon | nan | nan | nan | Yes | Tuesday | N193-117_0296-header.jpg | 1930 | nan | NaT |
336 | Walker.C | 2019-08-16 02:50:38 UTC | 35124942 | nan | nan | nan | nan | None | nan | N193-115_0289-header.jpg | 1929 | nan | NaT |
343 | Walker.C | 2019-08-16 02:53:36 UTC | 35124053 | Noon | [unclear][/unclear] | July | 1929 | Yes | Tuesday | N193-115_0089-header.jpg | 1929 | nan | NaT |
373 | Moegami | 2019-08-16 03:34:43 UTC | 35124840 | nan | nan | nan | nan | None | nan | N193-115_0219-header.jpg | 1929 | nan | NaT |
375 | Moegami | 2019-08-16 03:35:18 UTC | 35381766 | nan | nan | nan | nan | None | nan | N193-117_0026-header.jpg | 1930 | nan | NaT |
380 | Moegami | 2019-08-16 03:52:07 UTC | 35125020 | nan | nan | nan | nan | None | nan | N193-114_0191-header.jpg | 1929 | nan | NaT |
437 | Moegami | 2019-08-16 04:46:56 UTC | 35381705 | Afternoon | 21st | October | 1030 | Yes | Tuesday | N193-120_0243-header.jpg | 1930 | 21 | NaT |
476 | Moegami | 2019-08-16 05:52:26 UTC | 35381873 | Afternoon |  | October | 1930 | Yes | Friday | N193-120_0255-header.jpg | 1930 | nan | NaT |
501 | Moegami | 2019-08-16 06:23:49 UTC | 35125082 | nan | nan | nan | nan | None | nan | N193-116_0263-header.jpg | 1929 | nan | NaT |
511 | Walker.C | 2019-08-20 03:10:35 UTC | 35124596 | Morning | 27 | November | [unclear][/unclear] | Yes | Wednesday | N193-116_0217-header.jpg | 1929 | 27 | NaT |
522 | Walker.C | 2019-08-20 03:16:10 UTC | 35381004 | Afternoon | 12 | September | 12 | Yes | Friday | N193-119_0286-header.jpg | 1930 | 12 | NaT |
534 | Walker.C | 2019-08-20 22:50:05 UTC | 35381169 | Noon | 17 | April | 17 | Yes | Thursday | N193-118_0066-header.jpg | 1930 | 17 | NaT |
543 | Moegami | 2019-08-21 05:22:46 UTC | 35125082 | nan | nan | nan | nan | None | nan | N193-116_0263-header.jpg | 1929 | nan | NaT |
688 | Walker.C | 2019-08-28 03:25:57 UTC | 35381364 | No session heading | [unclear][/unclear] | January | 1930 | Yes | Saturday | N193-117_0080-header.jpg | 1930 | nan | NaT |
716 | Walker.C | 2019-08-28 03:41:41 UTC | 35382081 | nan | nan | nan | nan | None | nan | N193-117_0275-header.jpg | 1930 | nan | NaT |
762 | Walker.C | 2019-08-28 04:08:59 UTC | 35124835 | Afternoon | [unclear][/unclear] | May | 1929 | Yes | Friday | N193-114_0166-header.jpg | 1929 | nan | NaT |
776 | Walker.C | 2019-08-28 04:17:51 UTC | 35381848 | No session heading | [unclear][/unclear] | January | 1930 | Yes | Saturday | N193-117_0081-header.jpg | 1930 | nan | NaT |
823 | Walker.C | 2019-08-28 04:48:34 UTC | 35381003 | Noon | nan | nan | nan | Yes | None | N193-118_0106-header.jpg | 1930 | nan | NaT |
df_headers.loc[df_headers['T3'] != df_headers['year'].astype('int').astype('str')].style.applymap(lambda x: 'background-color: yellow', subset=pd.IndexSlice[:, ['T3','year']])
  | user_name | created_at | subject_ids | T0 | T1 | T2 | T3 | T4 | T5 | image | year | T1n | date
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | wragge | 2019-07-17 07:19:39 UTC | 35123970 | Morning | 12 | nan | nan | nan | nan | N193-115_0158-header.jpg | 1929 | 12 | NaT |
142 | Moegami | 2019-08-16 00:59:37 UTC | 35381824 | nan | nan | nan | nan | No | nan | N193-120_0142-header.jpg | 1930 | nan | NaT |
143 | Moegami | 2019-08-16 01:00:54 UTC | 35124810 | Afternoon | 16 | May | 1919 | Yes | Thursday | N193-114_0161-header.jpg | 1929 | 16 | 1919-05-16 00:00:00 |
145 | Moegami | 2019-08-16 01:02:03 UTC | 35124124 | Noon | 11 | November | 1989 | Yes | Monday | N193-116_0154-header.jpg | 1929 | 11 | 1989-11-11 00:00:00 |
148 | Walker.C | 2019-08-16 01:05:47 UTC | 35124297 | Afternoon | 14 | August | 1920 | Yes | Wednesday | N193-115_0171-header.jpg | 1929 | 14 | 1920-08-14 00:00:00 |
152 | Moegami | 2019-08-16 01:10:24 UTC | 35124926 | Afternoon | 16 | August | 1919 | Yes | Friday | N193-115_0182-header.jpg | 1929 | 16 | 1919-08-16 00:00:00 |
158 | Moegami | 2019-08-16 01:12:49 UTC | 35123865 | Noon | 24 | May | 1989 | Yes | Friday | N193-114_0192-header.jpg | 1929 | 24 | 1989-05-24 00:00:00 |
161 | Moegami | 2019-08-16 01:15:19 UTC | 35125138 | No session heading | 21 | August | 1979 | Yes | Wednesday | N193-115_0196-header.jpg | 1929 | 21 | 1979-08-21 00:00:00 |
185 | Moegami | 2019-08-16 01:31:18 UTC | 35124329 | Morning | 24 | October | 1989 | Yes | Thursday | N193-116_0087-header.jpg | 1929 | 24 | 1989-10-24 00:00:00 |
228 | Walker.C | 2019-08-16 02:05:45 UTC | 35380988 | nan | nan | nan | nan | None | nan | N193-119_0338-header.jpg | 1930 | nan | NaT |
263 | Walker.C | 2019-08-16 02:21:38 UTC | 35382123 | Afternoon | nan | nan | nan | Yes | Tuesday | N193-117_0296-header.jpg | 1930 | nan | NaT |
336 | Walker.C | 2019-08-16 02:50:38 UTC | 35124942 | nan | nan | nan | nan | None | nan | N193-115_0289-header.jpg | 1929 | nan | NaT |
373 | Moegami | 2019-08-16 03:34:43 UTC | 35124840 | nan | nan | nan | nan | None | nan | N193-115_0219-header.jpg | 1929 | nan | NaT |
375 | Moegami | 2019-08-16 03:35:18 UTC | 35381766 | nan | nan | nan | nan | None | nan | N193-117_0026-header.jpg | 1930 | nan | NaT |
380 | Moegami | 2019-08-16 03:52:07 UTC | 35125020 | nan | nan | nan | nan | None | nan | N193-114_0191-header.jpg | 1929 | nan | NaT |
437 | Moegami | 2019-08-16 04:46:56 UTC | 35381705 | Afternoon | 21st | October | 1030 | Yes | Tuesday | N193-120_0243-header.jpg | 1930 | 21 | NaT |
501 | Moegami | 2019-08-16 06:23:49 UTC | 35125082 | nan | nan | nan | nan | None | nan | N193-116_0263-header.jpg | 1929 | nan | NaT |
511 | Walker.C | 2019-08-20 03:10:35 UTC | 35124596 | Morning | 27 | November | [unclear][/unclear] | Yes | Wednesday | N193-116_0217-header.jpg | 1929 | 27 | NaT |
522 | Walker.C | 2019-08-20 03:16:10 UTC | 35381004 | Afternoon | 12 | September | 12 | Yes | Friday | N193-119_0286-header.jpg | 1930 | 12 | NaT |
534 | Walker.C | 2019-08-20 22:50:05 UTC | 35381169 | Noon | 17 | April | 17 | Yes | Thursday | N193-118_0066-header.jpg | 1930 | 17 | NaT |
543 | Moegami | 2019-08-21 05:22:46 UTC | 35125082 | nan | nan | nan | nan | None | nan | N193-116_0263-header.jpg | 1929 | nan | NaT |
560 | Moegami | 2019-08-21 23:09:16 UTC | 35124319 | Morning | 18 | December | 1921 | Yes | Wednesday | N193-116_0298-header.jpg | 1929 | 18 | 1921-12-18 00:00:00 |
632 | Moegami | 2019-08-23 06:51:17 UTC | 35381048 | Afternoon | 6 | January | 1929 | Yes | Monday | N193-117_0004-header.jpg | 1930 | 6 | 1929-01-06 00:00:00 |
716 | Walker.C | 2019-08-28 03:41:41 UTC | 35382081 | nan | nan | nan | nan | None | nan | N193-117_0275-header.jpg | 1930 | nan | NaT |
823 | Walker.C | 2019-08-28 04:48:34 UTC | 35381003 | Noon | nan | nan | nan | Yes | None | N193-118_0106-header.jpg | 1930 | nan | NaT |
Do the transcribed values for days of the week match the date?
# A list of days of the week
days = list(calendar.day_name)
# Get transcriptions that have a date and a day of the week
df_days = df_headers.loc[(df_headers['date'].notnull()) & (df_headers['T5'].notnull()) & (df_headers['T5'] != 'Not given or unclear')]
# Show days where they don't match
df_days.loc[df_days.apply(lambda x: x['date'].dayofweek != days.index(x['T5']), axis=1)].style.applymap(lambda x: 'background-color: yellow', subset=pd.IndexSlice[:, ['T5','date']])
  | user_name | created_at | subject_ids | T0 | T1 | T2 | T3 | T4 | T5 | image | year | T1n | date
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
116 | ErinGallant | 2019-07-29 02:33:26 UTC | 35124664 | Noon | 31 | May | 1929 | Yes | Thursday | N193-114_0219-header.jpg | 1929 | 31 | 1929-05-31 00:00:00 |
143 | Moegami | 2019-08-16 01:00:54 UTC | 35124810 | Afternoon | 16 | May | 1919 | Yes | Thursday | N193-114_0161-header.jpg | 1929 | 16 | 1919-05-16 00:00:00 |
145 | Moegami | 2019-08-16 01:02:03 UTC | 35124124 | Noon | 11 | November | 1989 | Yes | Monday | N193-116_0154-header.jpg | 1929 | 11 | 1989-11-11 00:00:00 |
148 | Walker.C | 2019-08-16 01:05:47 UTC | 35124297 | Afternoon | 14 | August | 1920 | Yes | Wednesday | N193-115_0171-header.jpg | 1929 | 14 | 1920-08-14 00:00:00 |
152 | Moegami | 2019-08-16 01:10:24 UTC | 35124926 | Afternoon | 16 | August | 1919 | Yes | Friday | N193-115_0182-header.jpg | 1929 | 16 | 1919-08-16 00:00:00 |
158 | Moegami | 2019-08-16 01:12:49 UTC | 35123865 | Noon | 24 | May | 1989 | Yes | Friday | N193-114_0192-header.jpg | 1929 | 24 | 1989-05-24 00:00:00 |
161 | Moegami | 2019-08-16 01:15:19 UTC | 35125138 | No session heading | 21 | August | 1979 | Yes | Wednesday | N193-115_0196-header.jpg | 1929 | 21 | 1979-08-21 00:00:00 |
162 | Walker.C | 2019-08-16 01:16:15 UTC | 35381789 | Morning | 17 | February | 1930 | Yes | Thursday | N193-117_0198-header.jpg | 1930 | 17 | 1930-02-17 00:00:00 |
185 | Moegami | 2019-08-16 01:31:18 UTC | 35124329 | Morning | 24 | October | 1989 | Yes | Thursday | N193-116_0087-header.jpg | 1929 | 24 | 1989-10-24 00:00:00 |
330 | Moegami | 2019-08-16 02:49:12 UTC | 35124177 | No session heading | 29th | July | 1929 | Yes | Saturday | N193-115_0107-header.jpg | 1929 | 29 | 1929-07-29 00:00:00 |
396 | Moegami | 2019-08-16 04:20:10 UTC | 35381385 | Noon | 4th | March | 1930 | Yes | Friday | N193-117_0259-header.jpg | 1930 | 4 | 1930-03-04 00:00:00 |
399 | ErinGallant | 2019-08-16 04:20:44 UTC | 35123851 | Afternoon | 8 | October | 1929 | Yes | Friday | N193-116_0027-header.jpg | 1929 | 8 | 1929-10-08 00:00:00 |
463 | Moegami | 2019-08-16 05:20:42 UTC | 35124029 | Afternoon | 16th | March | 1929 | Yes | Tuesday | N193-113_0301-header.jpg | 1929 | 16 | 1929-03-16 00:00:00 |
471 | Moegami | 2019-08-16 05:32:37 UTC | 35381931 | Morning | 10th | May | 1930 | Yes | Tuesday | N193-118_0100-header.jpg | 1930 | 10 | 1930-05-10 00:00:00 |
512 | Walker.C | 2019-08-20 03:11:03 UTC | 35123875 | Afternoon | 6 | June | 1929 | Yes | Wednesday | N193-114_0233-header.jpg | 1929 | 6 | 1929-06-06 00:00:00 |
560 | Moegami | 2019-08-21 23:09:16 UTC | 35124319 | Morning | 18 | December | 1921 | Yes | Wednesday | N193-116_0298-header.jpg | 1929 | 18 | 1921-12-18 00:00:00 |
632 | Moegami | 2019-08-23 06:51:17 UTC | 35381048 | Afternoon | 6 | January | 1929 | Yes | Monday | N193-117_0004-header.jpg | 1930 | 6 | 1929-01-06 00:00:00 |
644 | Moegami | 2019-08-23 06:59:20 UTC | 35123969 | Afternoon | 7 | November | 1929 | Yes | Tuesday | N193-116_0145-header.jpg | 1929 | 7 | 1929-11-07 00:00:00 |
648 | Moegami | 2019-08-26 06:03:31 UTC | 35124886 | Morning | 20 | August | 1929 | Yes | Friday | N193-115_0232-header.jpg | 1929 | 20 | 1929-08-20 00:00:00 |
681 | Walker.C | 2019-08-28 03:21:54 UTC | 35124664 | Noon | 31 | May | 1929 | Yes | Thursday | N193-114_0219-header.jpg | 1929 | 31 | 1929-05-31 00:00:00 |
753 | Walker.C | 2019-08-28 04:04:46 UTC | 35124994 | Morning | 29 | May | 1929 | Yes | Thursday | N193-114_0207-header.jpg | 1929 | 29 | 1929-05-29 00:00:00 |
825 | Walker.C | 2019-08-28 04:49:20 UTC | 35381901 | Noon | 17 | August | 1930 | Yes | Thursday | N193-119_0143-header.jpg | 1930 | 17 | 1930-08-17 00:00:00 |
datetime.datetime(1930, 8, 17).weekday()
6
days.index('Thursday')
3
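So 17 August 1930 was actually a Sunday (6), not the transcribed Thursday (3). The comparison above works because pandas' dayofweek uses the same Monday=0 convention as calendar.day_name:
pd.Timestamp('1930-08-17').dayofweek
# -> 6, matching datetime.weekday()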
from math import ceil
def week_of_month(dt):
""" Returns the week of the month for the specified date.
"""
first_day = dt.replace(day=1)
dom = dt.day
adjusted_dom = dom + first_day.weekday()
return int(ceil(adjusted_dom/7.0))
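As a quick check, 27 June 1929 falls in the fifth week of the month (June 1929 began on a Saturday), matching the week values in the tables below:
week_of_month(datetime.datetime(1929, 6, 27))
# -> 5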
df_headers.loc[:, 'week'] = df_headers.loc[(df_headers['date'].notnull())].loc[:, 'date'].apply(week_of_month)
Remove duplicates (see below)
df_headers.drop_duplicates(subset=['image', 'date'], inplace=True)
import altair as alt
alt.renderers.enable('default')
RendererRegistry.enable('default')
pointer = alt.selection_single(fields=['date'])
color = alt.condition(pointer,
alt.Color('count():Q'),
alt.value('lightgray'))
year = alt.Chart(df_headers.loc[(df_headers['date'].notnull()) & (df_headers['date'].dt.year == 1929)]).mark_rect().encode(
x=alt.X('day(date):O', title='day'),
y='week:O',
facet=alt.Facet('month(date):T', title=''),
color=color,
tooltip=['date:T', 'count():Q']
).properties(columns=4, title='1929').add_selection(pointer)
from altair import datum, expr
day = alt.Chart(df_headers.loc[(df_headers['date'].notnull())]).mark_bar().encode(
x='T0:N',
y='count():Q',
color=alt.Color('T0:N', legend=None),
tooltip=['date:T'],
column=alt.Column('year(date):T', title=None, header=alt.Header(format=' '))
).transform_filter(pointer).properties(title='Pages per session')
text = alt.Chart(df_headers.loc[(df_headers['date'].notnull())]).mark_text().encode(
text=alt.Text('min(date):N')
).transform_filter(pointer)
(year|day)
alt.Chart(df_headers.loc[(df_headers['date'].notnull())]).mark_rect().encode(
x='date(date):O',
y='month(date):O',
color='count():Q',
facet='year(date):O'
).properties(columns=1)
27 June 1929 has 3 afternoon pages, which seems odd. What's going on?
df_headers.loc[df_headers['date'] == '1929-06-27']
  | user_name | created_at | subject_ids | T0 | T1 | T2 | T3 | T4 | T5 | image | year | T1n | date | week
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
305 | Walker.C | 2019-08-16 02:43:29 UTC | 35124557 | Afternoon | 27 | June | 1929 | Yes | Thursday | N193-114_0319-header.jpg | 1929.0 | 27 | 1929-06-27 | 5.0 |
319 | Walker.C | 2019-08-16 02:46:31 UTC | 35124097 | Morning | 27 | June | 1929 | Yes | Thursday | N193-114_0315-header.jpg | 1929.0 | 27 | 1929-06-27 | 5.0 |
498 | Moegami | 2019-08-16 06:15:09 UTC | 35124045 | Afternoon | 27th | June | 1929 | Yes | Thursday | N193-114_0318-header.jpg | 1929.0 | 27 | 1929-06-27 | 5.0 |
552 | Moegami | 2019-08-21 05:51:36 UTC | 35124379 | Noon | 27 | June | 1929 | Yes | Thursday | N193-114_0317-header.jpg | 1929.0 | 27 | 1929-06-27 | 5.0 |
Looking at the image names you can see that two of them are duplicates: the same page has been photographed twice. These should be picked up in gold standard mode, but transcribers haven't been using it consistently.
It's easy enough to remove duplicates at this stage, but for non-gold-standard transcriptions we'll have to aggregate the results and identify conflicts.
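As a rough sketch of what that aggregation might look like (a simple majority vote per subject and task with a conflict flag; an illustration only, not this project's actual method):
def aggregate_task(df, task):
    # Hypothetical helper: most common transcribed value for each subject,
    # with a flag for subjects where transcribers disagree
    agg = df.groupby('subject_ids')[task].agg(
        value=lambda s: s.mode().iloc[0] if not s.mode().empty else None,
        n_unique='nunique',
    )
    agg['conflict'] = agg['n_unique'] > 1
    return agg
aggregate_task(df_headers, 'T3').head()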
Can we fill in the gaps? Looking at the beginning and end of 1929, it seems the number of pages for each session on a weekday is consistent: five pages per weekday, and two pages on Saturdays.
Can I predict the number of pages in a volume?
At this point I did a lot of investigating of public holidays in NSW and compiled the results into a CSV file.
df_series = pd.read_csv('series_list.csv')
all_holidays = pd.read_csv('nsw_holidays_1900_1950.csv')
#all_holidays.loc[:, 'date'] = pd.to_datetime(all_holidays.loc[:, 'date'], errors='coerce')
def get_holidays(year):
    # Dates of NSW public holidays for the given year
    holidays = all_holidays.loc[all_holidays['year'] == year]['date']
    return holidays.to_list()
def daterange(start_date, end_date):
    # Yield each date from start_date up to, but not including, end_date
    for n in range(int((end_date - start_date).days)):
        yield start_date.shift(days=+n)
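Note that the range excludes the end date:
[d.format('YYYY-MM-DD') for d in daterange(arrow.get(1929, 6, 24), arrow.get(1929, 6, 27))]
# -> ['1929-06-24', '1929-06-25', '1929-06-26']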
def predict_pages(start_date, end_date, weekday_pages=5, saturday_pages=2, include_saturday=True):
    pages = 0
    year = start_date.year
    holidays = sorted(get_holidays(year))
    for single_date in daterange(start_date, end_date):
        # Skip NSW holidays and Sundays (weekday 6)
        if single_date.format('YYYY-MM-DD') not in holidays and single_date.weekday() != 6:
            if single_date.weekday() == 5:
                # Saturdays have their own (smaller) page count
                if include_saturday is True:
                    pages += saturday_pages
            else:
                pages += weekday_pages
    return pages
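A quick sanity check on a single week, assuming the holidays CSV lists nothing for the first week of July 1929: five weekdays at five pages each, plus one Saturday at two pages, should give 27.
predict_pages(arrow.get(1929, 7, 1), arrow.get(1929, 7, 8))
# -> 27 (Monday 1 July to Saturday 6 July; the Sunday is skipped)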
vols = [
{
'id': '113',
'start_date': datetime.date(1929, 1, 7),
'end_date': datetime.date(1929, 3, 28),
'pages': 305
},
{
'id': '114',
'start_date': datetime.date(1929, 4, 4),
'end_date': datetime.date(1929, 6, 30),
'pages': 326
},
{
'id': '115',
'start_date': datetime.date(1929, 7, 1),
'end_date': datetime.date(1929, 10, 1),
'pages': 351
},
{
'id': '116',
'start_date': datetime.date(1929, 10, 1),
'end_date': datetime.date(1929, 12, 22),
'pages': 314
}
]
for vol in vols:
prediction = predict_pages(arrow.get(vol['start_date']), arrow.get(vol['end_date']))
print(f'Volume {vol["id"]}: {prediction} predicted / {vol["pages"]} actual pages')
Volume 113: 310 predicted / 305 actual pages
Volume 114: 326 predicted / 326 actual pages
Volume 115: 351 predicted / 351 actual pages
Volume 116: 314 predicted / 314 actual pages
So across the whole of 1929 there seems to be a difference of only 5 pages. Not too bad!
Let's see if we can find which days and why...
We can use the transcribed data to isolate the missing days.
# Get morning sessions because they only have one page
pages = df_headers.loc[(df_headers['T0'] == 'Morning') & (df_headers['date'].notnull()) & (df_headers['T3'] == '1929')][['image', 'date']].sort_values(by='date')
pages = pages.to_dict('records')
for vol in vols:
print(vol['id'])
start_date = arrow.get(vol['start_date'])
for page in pages:
vol_num, page_num = re.search(r'N193\-(\d+)_(\d+)', page['image']).groups()
if vol_num == vol['id']:
prediction = predict_pages(start_date, arrow.get(page['date']))
print(f'{page["date"].date()}: predicted {prediction}, actual {int(page_num) -1}')
113
1929-01-09: predicted 10, actual 10
1929-01-14: predicted 27, actual 27
1929-01-23: predicted 64, actual 64
1929-02-01: predicted 99, actual 99
1929-02-06: predicted 116, actual 116
1929-02-07: predicted 121, actual 121
1929-02-08: predicted 126, actual 126
1929-02-27: predicted 197, actual 197
1929-03-04: predicted 214, actual 213
1929-03-11: predicted 241, actual 240
1929-03-12: predicted 246, actual 245
1929-03-14: predicted 256, actual 253
1929-03-20: predicted 278, actual 275
114
1929-04-05: predicted 5, actual 5
1929-04-10: predicted 22, actual 22
1929-04-18: predicted 54, actual 54
1929-04-22: predicted 66, actual 66
1929-04-26: predicted 81, actual 81
1929-04-29: predicted 88, actual 88
1929-04-30: predicted 93, actual 93
1929-05-01: predicted 98, actual 98
1929-05-02: predicted 103, actual 103
1929-05-06: predicted 115, actual 115
1929-05-13: predicted 142, actual 142
1929-05-17: predicted 162, actual 162
1929-05-21: predicted 174, actual 174
1929-05-24: predicted 189, actual 189
1929-05-28: predicted 201, actual 201
1929-05-29: predicted 206, actual 206
1929-06-18: predicted 277, actual 277
1929-06-20: predicted 287, actual 287
1929-06-21: predicted 292, actual 292
1929-06-24: predicted 299, actual 299
1929-06-27: predicted 314, actual 314
115
1929-07-01: predicted 0, actual 0
1929-07-03: predicted 10, actual 10
1929-07-05: predicted 20, actual 20
1929-07-11: predicted 42, actual 42
1929-07-16: predicted 59, actual 59
1929-07-17: predicted 64, actual 64
1929-07-26: predicted 101, actual 101
1929-07-30: predicted 113, actual 113
1929-08-02: predicted 128, actual 128
1929-08-07: predicted 140, actual 140
1929-08-13: predicted 162, actual 162
1929-08-16: predicted 177, actual 177
1929-08-20: predicted 189, actual 231
1929-08-21: predicted 194, actual 194
1929-08-23: predicted 204, actual 204
1929-08-28: predicted 221, actual 221
1929-09-02: predicted 238, actual 238
1929-09-03: predicted 243, actual 243
1929-09-09: predicted 265, actual 265
1929-09-11: predicted 275, actual 275
1929-09-17: predicted 297, actual 297
1929-09-23: predicted 319, actual 319
1929-09-25: predicted 329, actual 329
116
1929-10-03: predicted 10, actual 10
1929-10-08: predicted 22, actual 22
1929-10-11: predicted 37, actual 37
1929-10-22: predicted 76, actual 76
1929-11-05: predicted 130, actual 130
1929-11-07: predicted 140, actual 140
1929-11-26: predicted 211, actual 211
Missing in 1929
def find_date(day, month, year):
    # Predict the page number on which a given date should start,
    # counting from the beginning of volume 114 (4 April 1929)
    end_date = arrow.get(year, month, day)
    start_date = arrow.get(1929, 4, 4)
    page = predict_pages(start_date, end_date)
    print(page + 1)
find_date(6, 6, 1929)
234
pages_1930 = df_headers.loc[(df_headers['T0'] == 'Morning') & (df_headers['date'].notnull()) & (df_headers['T3'] == '1930')][['image', 'date']].sort_values(by='date')
pages_1930 = pages_1930.to_dict('records')
#print(pages_1930)
for row in df_series.dropna(subset=['start_date', 'end_date']).itertuples():
start_date = arrow.get(row.start_date, 'YYYY-MM-DD')
if start_date.year == 1930:
print(row.Item_number)
for page in pages_1930:
vol_num, page_num = re.search(r'(N193\-\d+)_(\d+)', page['image']).groups()
if vol_num in row.Item_number:
prediction = predict_pages(start_date, arrow.get(page['date']))
if vol_num == 'N193-120':
print(f'{page["date"].date()}: predicted {prediction}, actual {row.Pages - (int(page_num))}')
else:
print(f'{page["date"].date()}: predicted {prediction}, actual {int(page_num) -1}')
AU NBAC N193-117
1930-01-06: predicted 0, actual 0
1930-01-07: predicted 5, actual 5
1930-01-14: predicted 32, actual 32
1930-01-17: predicted 47, actual 47
1930-02-03: predicted 103, actual 103
1930-02-05: predicted 113, actual 113
1930-02-11: predicted 135, actual 135
1930-02-12: predicted 140, actual 140
1930-02-17: predicted 157, actual 197
1930-02-27: predicted 199, actual 197
1930-03-10: predicted 238, actual 236
1930-03-26: predicted 302, actual 300
1930-03-27: predicted 307, actual 305
AU NBAC N193-118
1930-04-10: predicted 37, actual 37
1930-04-16: predicted 59, actual 59
1930-05-07: predicted 123, actual 104
1930-05-09: predicted 133, actual 114
1930-05-10: predicted 138, actual 99
1930-05-12: predicted 140, actual 119
1930-05-15: predicted 155, actual 134
1930-05-20: predicted 172, actual 151
1930-05-22: predicted 182, actual 161
1930-05-23: predicted 187, actual 166
1930-05-26: predicted 194, actual 173
1930-05-29: predicted 209, actual 188
1930-06-04: predicted 231, actual 210
1930-06-10: predicted 248, actual 227
1930-06-13: predicted 263, actual 242
1930-06-18: predicted 280, actual 259
1930-06-20: predicted 290, actual 269
1930-06-24: predicted 302, actual 281
AU NBAC N193-119
1930-07-01: predicted 0, actual 0
1930-07-02: predicted 5, actual 5
1930-07-17: predicted 64, actual 64
1930-07-25: predicted 96, actual 96
1930-08-07: predicted 140, actual 140
1930-08-27: predicted 216, actual 216
1930-08-29: predicted 226, actual 226
1930-09-01: predicted 233, actual 233
1930-09-03: predicted 243, actual 243
1930-09-04: predicted 248, actual 248
1930-09-11: predicted 275, actual 276
1930-09-12: predicted 280, actual 281
1930-09-15: predicted 287, actual 288
1930-09-18: predicted 302, actual 303
1930-09-24: predicted 324, actual 325
1930-09-30: predicted 346, actual 347
AU NBAC N193-120
1930-10-01: predicted 0, actual 0
1930-10-02: predicted 5, actual 5
1930-10-10: predicted 32, actual 32
1930-10-17: predicted 59, actual 59
1930-10-27: predicted 93, actual 93
1930-10-29: predicted 103, actual 103
1930-11-11: predicted 152, actual 152
1930-11-28: predicted 221, actual 221
1930-12-08: predicted 255, actual 255
1930-12-16: predicted 287, actual 287
1930-12-17: predicted 292, actual 292
1930-12-23: predicted 314, actual 314
Vol 117
Missing afternoon of 26 Feb 1930 (2 pages)
Vol 118
Missing from afternoon of 17 April 1930 to Saturday 26 April (inclusive) -- 18, 19, 21 were Easter holidays, Anzac Day on 25. Still missing Thursday afternoon (2 pages), Tuesday, Wednesday & Thursday (15 pages), and Saturday 26 (2 pages) -- so 19 pages, which matches above. Were these just extra holidays?
Missing afternoon of 9 May (2 pages)
Vol 119
119_264 & 119_265 are duplicates (so there's 1 extra page in this volume)
Vol 120
Images in Volume 120 are numbered backwards!!!!!
Missing afternoon on 23 Dec (2 pages)
df_series = pd.read_excel('Sydney Stock Exchange N193_Tim.xlsx')
# Split the Coverage field into separate start_date and end_date columns
df_series = df_series.join(df_series['Coverage'].str.split(' - ', expand=True)).rename({0: 'start_date', 1: 'end_date', 'Item Number': 'Item_number'}, axis=1)
df_series.head()
  | Item_number | Title | Coverage | Pages | Notes | start_date | end_date
---|---|---|---|---|---|---|---|
0 | AU NBAC N193-1 | Sydney Stock Exchange Stock and Share Lists | 1901-01-07 - 1901-03-30 | 303 | NaN | 1901-01-07 | 1901-03-30 |
1 | AU NBAC N193-2 | Sydney Stock Exchange Stock and Share Lists | 1901-04-01 - 1901-06-29 | 312 | NaN | 1901-04-01 | 1901-06-29 |
2 | AU NBAC N193-3 | Sydney Stock Exchange Stock and Share Lists | 1901-07-01 - 1901-09-30 | 345 | NaN | 1901-07-01 | 1901-09-30 |
3 | AU NBAC N193-4 | Sydney Stock Exchange Stock and Share Lists | 1901-10-01 - 1901-12-24 | 312 | NaN | 1901-10-01 | 1901-12-24 |
4 | AU NBAC N193-5 | Sydney Stock Exchange Stock and Share Lists | 1902-01-06 - 1902-03-27 | 305 | NaN | 1902-01-06 | 1902-03-27 |
def pad_vol_nums(vol_ref):
    # Zero-pad the volume number at the end of an item reference
    prefix, vol_num = vol_ref.split('-')
    return f'{prefix}-{int(vol_num):03}'
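For example:
pad_vol_nums('AU NBAC N193-1')
# -> 'AU NBAC N193-001'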
df_series[:-1].loc[:, 'Item_number'] = df_series[:-1].loc[:, 'Item_number'].apply(pad_vol_nums)
df_series.head()
/Users/tim/mycode/stock-exchange/lib/python3.7/site-packages/pandas/core/indexing.py:543: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  | Item_number | Title | Coverage | Pages | Notes | start_date | end_date
---|---|---|---|---|---|---|---|
0 | AU NBAC N193-001 | Sydney Stock Exchange Stock and Share Lists | 1901-01-07 - 1901-03-30 | 303 | NaN | 1901-01-07 | 1901-03-30 |
1 | AU NBAC N193-002 | Sydney Stock Exchange Stock and Share Lists | 1901-04-01 - 1901-06-29 | 312 | NaN | 1901-04-01 | 1901-06-29 |
2 | AU NBAC N193-003 | Sydney Stock Exchange Stock and Share Lists | 1901-07-01 - 1901-09-30 | 345 | NaN | 1901-07-01 | 1901-09-30 |
3 | AU NBAC N193-004 | Sydney Stock Exchange Stock and Share Lists | 1901-10-01 - 1901-12-24 | 312 | NaN | 1901-10-01 | 1901-12-24 |
4 | AU NBAC N193-005 | Sydney Stock Exchange Stock and Share Lists | 1902-01-06 - 1902-03-27 | 305 | NaN | 1902-01-06 | 1902-03-27 |
df_series[:-1].to_csv('series_list.csv', index=False)
df_series.to_csv('series_list.csv', index=False)