# Install Apache Beam (required by the TFDS C4_200M generation pipeline)
# and the nightly TFDS build that ships the c4_200m dataset definition.
!pip install -q apache_beam
!pip install -q tensorflow-datasets tfds-nightly
|████████████████████████████████| 9.8 MB 5.1 MB/s
|████████████████████████████████| 247 kB 71.2 MB/s
|████████████████████████████████| 151 kB 74.3 MB/s
|████████████████████████████████| 829 kB 31.7 MB/s
|████████████████████████████████| 62 kB 958 kB/s
|████████████████████████████████| 2.3 MB 65.8 MB/s
Building wheel for avro-python3 (setup.py) ... done
Building wheel for dill (setup.py) ... done
Building wheel for future (setup.py) ... done
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
multiprocess 0.70.12.2 requires dill>=0.3.4, but you have dill 0.3.1.1 which is incompatible.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.26.0 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
|████████████████████████████████| 4.1 MB 5.1 MB/s
import tensorflow_datasets as tfds
!mkdir ~/.kaggle
!pip install -q kaggle
kaggle_username =''
kaggle_key = ''
!echo '{"username":kaggle_username,"key":kaggle_key}' > ~/.kaggle/kaggle.json
!kaggle datasets download -d a0155991rliwei/c4-200m -p /content
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json' Downloading c4-200m.zip to /content 100% 19.3G/19.3G [03:35<00:00, 79.4MB/s] 100% 19.3G/19.3G [03:35<00:00, 96.1MB/s]
# SentencePiece backs the T5 tokenizer loaded below.
!pip install sentencepiece
Collecting sentencepiece Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB) |████████████████████████████████| 1.2 MB 4.9 MB/s Installing collected packages: sentencepiece Successfully installed sentencepiece-0.1.96
!pip install -q transformers
# NOTE(review): AutoConfig and AutoModelForSequenceClassification are not
# used anywhere in this visible notebook; T5 is a seq2seq model, so a
# grammar-correction fine-tune would normally load AutoModelForSeq2SeqLM —
# confirm before training.
from transformers import (
AutoConfig,
AutoTokenizer,
AutoModelForSequenceClassification,
)
import pandas as pd
model_name = 't5-base'  # pretrained checkpoint id on the Hugging Face Hub
# Initialise tokenizer (first call downloads config/vocab files — the
# "Downloading" progress bars in the recorded output).
tokenizer = AutoTokenizer.from_pretrained(model_name)
Downloading: 0%| | 0.00/1.17k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/773k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/1.32M [00:00<?, ?B/s]
# Extract the Kaggle archive; it contains an already-prepared TFDS dataset
# directory at /content/c4200m/1.0.0 (tfrecord shards + metadata).
!unzip -q /content/c4-200m.zip
# Build a DatasetBuilder straight from that directory — no generation step
# is needed (see the absl "Load dataset info" log line).
c4_builder = tfds.core.builder_from_directory('/content/c4200m/1.0.0')
INFO:absl:Load dataset info from /content/c4200m/1.0.0
# Metadata are available as usual via the builder's DatasetInfo.
num_train_examples = c4_builder.info.splits['train'].num_examples
print(num_train_examples)
183894319
# Effectively a no-op: the data already exists on disk, so TFDS just reuses
# it (see the "Reusing dataset c4200m" log line).
c4_builder.download_and_prepare()
INFO:absl:Reusing dataset c4200m (/content/c4200m/1.0.0)
# Materialize a 550k-example sample of the train split into a DataFrame.
# NOTE(review): shuffle_files=True makes this sample non-deterministic
# across runs — pass a read seed if reproducibility matters.
train_data = c4_builder.as_dataset(split='train', shuffle_files=True)
train_df = tfds.as_dataframe(train_data.take(550000))
train_df.shape
INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0
(550000, 2)
# Peek at the sample — both columns are still raw bytes (b'...') here;
# they are decoded to str below.
train_df.head()
input | output | |
---|---|---|
0 | b"Can be empenty'' for more damage, but not ne... | b'Can be empathy for more damage, but not need... |
1 | b'Miguelx completed Pollster Badge.' | b'Miguelx completed the Pollster Badge.' |
2 | b'This classic three day itinerary is take you... | b'This classic three-day itinerary takes you t... |
3 | b'Kimbrelle shares an inspirational story wher... | b'Kimbrelle shares an inspirational story wher... |
4 | b'Variation: The utility players get a guideli... | b'Variation: The utility players get a guideli... |
# TFDS yields byte strings; decode both text columns to UTF-8 so the CSV
# written below contains plain text rather than b'...' reprs.
for column in ('input', 'output'):
    train_df[column] = train_df[column].str.decode('utf-8')
train_df.head()
input | output | |
---|---|---|
0 | The steps below describe how to remove data fo... | The steps below describe how to remove data fo... |
1 | When I wake up it\'s usually comes out dreamsI... | When I wake up it\'s usually dreams I\'m think... |
2 | One of the cardinal factors to be considered t... | One of the cardinal factors to consider when t... |
3 | Answers » Regions » Is in Nagorno-Karabakt reg... | Answers » Regions » Is Nagorno-Karabakh region... |
4 | Flaneuring in fun at maple creek SK! | Flaneuring Fun in Maple Creek SK! |
# Persist the decoded 550k sample to Google Drive (Drive must already be
# mounted at /content/drive).
train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_550k.csv', index=False)
# Draw a fresh shuffled sample of 1,000,000 examples for the larger CSV.
# FIX: the pasted cell still said take(550000), but the recorded output
# shape is (1000000, 2) and the file saved below is c4_200m_1M.csv —
# 1M is clearly the intended sample size.
train_data = c4_builder.as_dataset(split='train', shuffle_files=True)
train_df = tfds.as_dataframe(train_data.take(1000000))
train_df.shape
INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0
(1000000, 2)
# Same decoding step as for the 550k sample: turn the raw byte-string
# columns into unicode text before writing the CSV.
for _col in ['input', 'output']:
    train_df[_col] = train_df[_col].str.decode('utf-8')
train_df.head()
input | output | |
---|---|---|
0 | Medell he, Ohio W. Shannon Kansas,R. C. | Medell, Ohio; W. Shannon. Kansas; R. C. |
1 | quarter of 1999 $ 25 million was repaid under ... | quarter of 1999, $25 million was repaid under ... |
2 | It used as service center by the Block office ... | It can be used as a service center by the Bloc... |
3 | Tom offered two this time, one of old restaura... | Tom offered two this time, one of old restaura... |
4 | You can see 'Spring beauties' at The Botanical... | You can see Spring Beauties at The Botanical G... |
# Persist the decoded 1M sample alongside the 550k CSV on Drive.
train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_1M.csv', index=False)