# Install runtime dependencies (protobuf pinned to 3.20.0 for compatibility
# with the installed transformers/sentencepiece stack).
! pip install tokenizer datasets sentencepiece protobuf==3.20.0
# Show the available GPU — the benchmark below requires CUDA.
! nvidia-smi
Requirement already satisfied: tokenizer in /usr/local/lib/python3.9/dist-packages (3.4.2) Collecting datasets Downloading datasets-2.7.1-py3-none-any.whl (451 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 451.7/451.7 KB 34.3 MB/s eta 0:00:00 Requirement already satisfied: sentencepiece in /usr/local/lib/python3.9/dist-packages (0.1.97) Requirement already satisfied: protobuf==3.20.0 in /usr/local/lib/python3.9/dist-packages (3.20.0) Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from datasets) (1.23.5) Collecting fsspec[http]>=2021.11.1 Downloading fsspec-2022.11.0-py3-none-any.whl (139 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 139.5/139.5 KB 38.6 MB/s eta 0:00:00 Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.9/dist-packages (from datasets) (4.64.1) Collecting xxhash Downloading xxhash-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 212.0/212.0 KB 46.3 MB/s eta 0:00:00 Collecting multiprocess Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 KB 37.1 MB/s eta 0:00:00 Collecting pyarrow>=6.0.0 Downloading pyarrow-10.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.9/35.9 MB 71.6 MB/s eta 0:00:0000:0100:01 Collecting pandas Downloading pandas-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.2/12.2 MB 127.0 MB/s eta 0:00:0000:0100:01 Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from datasets) (6.0) Requirement already satisfied: packaging in /usr/local/lib/python3.9/dist-packages (from datasets) (22.0) Requirement already satisfied: huggingface-hub<1.0.0,>=0.2.0 in /usr/local/lib/python3.9/dist-packages (from datasets) (0.11.1) Collecting aiohttp Downloading 
aiohttp-3.8.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 84.3 MB/s eta 0:00:00 Collecting dill<0.3.7 Downloading dill-0.3.6-py3-none-any.whl (110 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 KB 27.7 MB/s eta 0:00:00 Requirement already satisfied: requests>=2.19.0 in /usr/lib/python3/dist-packages (from datasets) (2.22.0) Collecting responses<0.19 Downloading responses-0.18.0-py3-none-any.whl (38 kB) Collecting aiosignal>=1.1.2 Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB) Collecting yarl<2.0,>=1.0 Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 264.6/264.6 KB 62.1 MB/s eta 0:00:00 Collecting frozenlist>=1.1.1 Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 158.8/158.8 KB 31.4 MB/s eta 0:00:00 Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (22.1.0) Collecting async-timeout<5.0,>=4.0.0a3 Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB) Collecting charset-normalizer<3.0,>=2.0 Downloading charset_normalizer-2.1.1-py3-none-any.whl (39 kB) Collecting multidict<7.0,>=4.5 Downloading multidict-6.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 114.2/114.2 KB 28.1 MB/s eta 0:00:00 Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (4.4.0) Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (3.8.2) Collecting urllib3>=1.25.10 Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 140.6/140.6 KB 21.1 MB/s eta 0:00:00 Requirement already 
satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas->datasets) (2.8.2) Collecting pytz>=2020.1 Downloading pytz-2022.6-py2.py3-none-any.whl (498 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 498.1/498.1 KB 55.9 MB/s eta 0:00:00 Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.14.0) Requirement already satisfied: idna>=2.0 in /usr/lib/python3/dist-packages (from yarl<2.0,>=1.0->aiohttp->datasets) (2.8) Installing collected packages: pytz, xxhash, urllib3, pyarrow, multidict, fsspec, frozenlist, dill, charset-normalizer, async-timeout, yarl, responses, pandas, multiprocess, aiosignal, aiohttp, datasets Attempting uninstall: urllib3 Found existing installation: urllib3 1.25.8 Uninstalling urllib3-1.25.8: Successfully uninstalled urllib3-1.25.8 Successfully installed aiohttp-3.8.3 aiosignal-1.3.1 async-timeout-4.0.2 charset-normalizer-2.1.1 datasets-2.7.1 dill-0.3.6 frozenlist-1.3.3 fsspec-2022.11.0 multidict-6.0.3 multiprocess-0.70.14 pandas-1.5.2 pyarrow-10.0.1 pytz-2022.6 responses-0.18.0 urllib3-1.26.13 xxhash-3.1.0 yarl-1.8.2 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available. You should consider upgrading via the '/usr/bin/python3.9 -m pip install --upgrade pip' command. Thu Dec 15 13:32:01 2022 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.6 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |===============================+======================+======================| | 0 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | | 0% 29C P0 69W / 300W | 2154MiB / 22731MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| +-----------------------------------------------------------------------------+
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import time
import torch
/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.13) or chardet (3.0.4) doesn't match a supported version! warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
# Load two identical copies of the French XNLI classifier:
# `model` stays the eager-mode baseline, `model_opt` gets compiled by
# kernl further down so the two can be compared head-to-head.
model_name = "BaptisteDoyen/camembert-base-xnli"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.cuda().eval()

model_opt = AutoModelForSequenceClassification.from_pretrained(model_name)
model_opt = model_opt.cuda().eval()

# Shared tokenizer and the XNLI benchmark (French subset).
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset(path="xnli", name="fr")
Downloading: 0%| | 0.00/882 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/443M [00:00<?, ?B/s]
Downloading: 0%| | 0.00/433 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/811k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/299 [00:00<?, ?B/s]
Downloading builder script: 0%| | 0.00/8.78k [00:00<?, ?B/s]
Downloading metadata: 0%| | 0.00/36.6k [00:00<?, ?B/s]
Downloading readme: 0%| | 0.00/18.1k [00:00<?, ?B/s]
Downloading and preparing dataset xnli/fr to /root/.cache/huggingface/datasets/xnli/fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...
Downloading data files: 0%| | 0/2 [00:00<?, ?it/s]
Downloading data: 0%| | 0.00/466M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/17.9M [00:00<?, ?B/s]
Extracting data files #1: 0%| | 0/1 [00:00<?, ?obj/s]
Extracting data files #0: 0%| | 0/1 [00:00<?, ?obj/s]
Generating train split: 0%| | 0/392702 [00:00<?, ? examples/s]
Generating test split: 0%| | 0/5010 [00:00<?, ? examples/s]
Generating validation split: 0%| | 0/2490 [00:00<?, ? examples/s]
Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.
0%| | 0/3 [00:00<?, ?it/s]
Below we do a warmup; it builds the Triton kernels optimized for each input size.
from kernl.model_optimization import optimize_model

# Replace model_opt's forward with kernl's Triton-kernel implementation.
optimize_model(model_opt)

# Warmup pass: run both models once for every sequence length the benchmark
# can produce (multiples of 8, up to 128) so the Triton kernels are compiled
# before the timed loop. Dummy all-ones tensors are enough to trigger builds.
start = time.perf_counter()
with torch.inference_mode(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=True):
    for seq_len in range(8, 128 + 8, 8):
        shape = (1, seq_len)
        dummy_inputs = {
            "input_ids": torch.ones(shape, dtype=torch.long, device="cuda"),
            "attention_mask": torch.ones(shape, dtype=torch.long, device="cuda"),
        }
        _ = model_opt(**dummy_inputs)
        _ = model(**dummy_inputs)
print(f"{time.perf_counter() - start:.0f}s")
370s
# Benchmark: run the whole XNLI French test split through both models,
# accumulating forward-pass wall-clock time and accuracy for each, and
# checking the optimized logits stay close to the baseline's.
complete_time_baseline = 0
score_baseline = 0
complete_time_optimized = 0
score_optimize = 0
nb_examples = len(dataset["test"])
nb_disagree = 0  # examples where the two models predict different classes
with torch.inference_mode(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=True):
    for content in dataset["test"]:
        # Access fields by name: unpacking `content.values()` silently
        # depends on the dataset's column order and breaks if it changes.
        premise = content["premise"]
        hypothesis = content["hypothesis"]
        label = content["label"]
        # pad_to_multiple_of=8 keeps sequence lengths on the warmup grid,
        # so no new Triton kernel is compiled inside the timed loop.
        inputs = tokenizer(premise, hypothesis, return_tensors="pt", pad_to_multiple_of=8, padding=True)
        inputs = dict(inputs.to("cuda"))
        # --- baseline timing ---
        torch.cuda.synchronize()
        start = time.perf_counter()
        output_original = model(**inputs)
        torch.cuda.synchronize()
        complete_time_baseline += time.perf_counter() - start
        choice_baseline = torch.argmax(output_original.logits, dim=1)
        score_baseline += label == choice_baseline.item()
        # --- optimized timing ---
        # Drain any pending GPU work before starting the timer, mirroring
        # the baseline measurement above; the original only synchronized
        # before the baseline timer, making the comparison slightly unfair.
        torch.cuda.synchronize()
        start = time.perf_counter()
        output_optimized = model_opt(**inputs)
        torch.cuda.synchronize()
        complete_time_optimized += time.perf_counter() - start
        choice_optimize = torch.argmax(output_optimized.logits, dim=1)
        score_optimize += label == choice_optimize.item()
        # fp16 kernels may differ slightly from the baseline; tolerate up
        # to 1e-1 absolute difference on the logits.
        assert torch.allclose(
            output_original.logits, output_optimized.logits, atol=1e-1
        ), f"logits don't match:\n{output_original}\n{output_optimized}"
        if choice_baseline != choice_optimize:
            nb_disagree += 1
print(f"{complete_time_baseline=:.2f}s")
print(f"{complete_time_optimized=:.2f}s")
print(f"{nb_disagree=}")
print(f"score baseline: {score_baseline / nb_examples:.2f}")
print(f"score optimize: {score_optimize / nb_examples:.2f}")
complete_time_baseline=38.08s complete_time_optimized=5.25s nb_disagree=1 score baseline: 0.82 score optimize: 0.82