# Install Apache Beam (required by the TFDS C4_200M generation pipeline)
# and the nightly TFDS build that ships the c4_200m dataset definition.
!pip install -q apache_beam
!pip install -q tensorflow-datasets tfds-nightly
|████████████████████████████████| 9.8 MB 5.1 MB/s
|████████████████████████████████| 247 kB 71.2 MB/s
|████████████████████████████████| 151 kB 74.3 MB/s
|████████████████████████████████| 829 kB 31.7 MB/s
|████████████████████████████████| 62 kB 958 kB/s
|████████████████████████████████| 2.3 MB 65.8 MB/s
Building wheel for avro-python3 (setup.py) ... done
Building wheel for dill (setup.py) ... done
Building wheel for future (setup.py) ... done
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
multiprocess 0.70.12.2 requires dill>=0.3.4, but you have dill 0.3.1.1 which is incompatible.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.26.0 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
|████████████████████████████████| 4.1 MB 5.1 MB/s
import tensorflow_datasets as tfds
!mkdir ~/.kaggle
!pip install -q kaggle
kaggle_username =''
kaggle_key = ''
!echo '{"username":kaggle_username,"key":kaggle_key}' > ~/.kaggle/kaggle.json
!kaggle datasets download -d a0155991rliwei/c4-200m -p /content
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json' Downloading c4-200m.zip to /content 100% 19.3G/19.3G [03:35<00:00, 79.4MB/s] 100% 19.3G/19.3G [03:35<00:00, 96.1MB/s]
# SentencePiece backs the T5 tokenizer loaded below.
!pip install sentencepiece
Collecting sentencepiece Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB) |████████████████████████████████| 1.2 MB 4.9 MB/s Installing collected packages: sentencepiece Successfully installed sentencepiece-0.1.96
!pip install -q transformers
# NOTE(review): AutoConfig and AutoModelForSequenceClassification are not
# used anywhere in this visible notebook; T5 is a seq2seq model, so a
# grammar-correction fine-tune would normally load AutoModelForSeq2SeqLM —
# confirm before training.
from transformers import (
AutoConfig,
AutoTokenizer,
AutoModelForSequenceClassification,
)
import pandas as pd
model_name = 't5-base'  # pretrained checkpoint id on the Hugging Face Hub
# Initialise tokenizer (first call downloads config/vocab files — the
# "Downloading" progress bars in the recorded output).
tokenizer = AutoTokenizer.from_pretrained(model_name)
Downloading: 0%| | 0.00/1.17k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/773k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/1.32M [00:00<?, ?B/s]
# Extract the Kaggle archive; it contains an already-prepared TFDS dataset
# directory at /content/c4200m/1.0.0 (tfrecord shards + metadata).
!unzip -q /content/c4-200m.zip
# Build a DatasetBuilder straight from that directory — no generation step
# is needed (see the absl "Load dataset info" log line).
c4_builder = tfds.core.builder_from_directory('/content/c4200m/1.0.0')
INFO:absl:Load dataset info from /content/c4200m/1.0.0
# Metadata are available as usual via the builder's DatasetInfo.
num_train_examples = c4_builder.info.splits['train'].num_examples
print(num_train_examples)
183894319
# Effectively a no-op: the data already exists on disk, so TFDS just reuses
# it (see the "Reusing dataset c4200m" log line).
c4_builder.download_and_prepare()
INFO:absl:Reusing dataset c4200m (/content/c4200m/1.0.0)
# Materialize a 550k-example sample of the train split into a DataFrame.
# NOTE(review): shuffle_files=True makes this sample non-deterministic
# across runs — pass a read seed if reproducibility matters.
train_data = c4_builder.as_dataset(split='train', shuffle_files=True)
train_df = tfds.as_dataframe(train_data.take(550000))
train_df.shape
INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0
(550000, 2)
# Peek at the sample — both columns are still raw bytes (b'...') here;
# they are decoded to str below.
train_df.head()
input | output | |
---|---|---|
0 | b"Can be empenty'' for more damage, but not ne... | b'Can be empathy for more damage, but not need... |
1 | b'Miguelx completed Pollster Badge.' | b'Miguelx completed the Pollster Badge.' |
2 | b'This classic three day itinerary is take you... | b'This classic three-day itinerary takes you t... |
3 | b'Kimbrelle shares an inspirational story wher... | b'Kimbrelle shares an inspirational story wher... |
4 | b'Variation: The utility players get a guideli... | b'Variation: The utility players get a guideli... |
# TFDS yields byte strings; decode both text columns to UTF-8 so the CSV
# written below contains plain text rather than b'...' reprs.
for column in ('input', 'output'):
    train_df[column] = train_df[column].str.decode('utf-8')
train_df.head()
input | output | |
---|---|---|
0 | The steps below describe how to remove data fo... | The steps below describe how to remove data fo... |
1 | When I wake up it\'s usually comes out dreamsI... | When I wake up it\'s usually dreams I\'m think... |
2 | One of the cardinal factors to be considered t... | One of the cardinal factors to consider when t... |
3 | Answers » Regions » Is in Nagorno-Karabakt reg... | Answers » Regions » Is Nagorno-Karabakh region... |
4 | Flaneuring in fun at maple creek SK! | Flaneuring Fun in Maple Creek SK! |
# Persist the decoded 550k sample to Google Drive (Drive must already be
# mounted at /content/drive).
train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_550k.csv', index=False)
# Draw a fresh shuffled sample of 1,000,000 examples for the larger CSV.
# FIX: the pasted cell still said take(550000), but the recorded output
# shape is (1000000, 2) and the file saved below is c4_200m_1M.csv —
# 1M is clearly the intended sample size.
train_data = c4_builder.as_dataset(split='train', shuffle_files=True)
train_df = tfds.as_dataframe(train_data.take(1000000))
train_df.shape
INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0
(1000000, 2)
# Same decoding step as for the 550k sample: turn the raw byte-string
# columns into unicode text before writing the CSV.
for _col in ['input', 'output']:
    train_df[_col] = train_df[_col].str.decode('utf-8')
train_df.head()
input | output | |
---|---|---|
0 | Medell he, Ohio W. Shannon Kansas,R. C. | Medell, Ohio; W. Shannon. Kansas; R. C. |
1 | quarter of 1999 $ 25 million was repaid under ... | quarter of 1999, $25 million was repaid under ... |
2 | It used as service center by the Block office ... | It can be used as a service center by the Bloc... |
3 | Tom offered two this time, one of old restaura... | Tom offered two this time, one of old restaura... |
4 | You can see 'Spring beauties' at The Botanical... | You can see Spring Beauties at The Botanical G... |
# Persist the decoded 1M sample alongside the 550k CSV on Drive.
train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_1M.csv', index=False)