#python #openvino #openvino-notebooks #deeplearning #accelerated-inference #quantization #nncf #optimization #pytorch
302-pytorch-quantization-aware-training: Optimizing PyTorch models with the Neural Network Compression Framework (NNCF) of OpenVINO using 8-bit quantization

# Imports and Settings

# On Windows, add the directory that contains cl.exe to the PATH to enable PyTorch to find the
# required C++ tools. This code assumes that Visual Studio 2019 is installed in the default
# directory. If you have a different C++ compiler, add the correct path to os.environ["PATH"]
# directly. Note that the C++ Redistributable is not enough to run this notebook.
# Adding the path to os.environ["LIB"] is not always required - it depends on the system configuration.
import sys

if sys.platform == "win32":
    import distutils.command.build_ext
    import distutils.core  # imported explicitly; distutils.core.Distribution is used below
    import os
    from pathlib import Path

    VS_INSTALL_DIR = r"C:/Program Files (x86)/Microsoft Visual Studio"
    cl_paths = sorted(list(Path(VS_INSTALL_DIR).glob("**/Hostx86/x64/cl.exe")))
    if len(cl_paths) == 0:
        raise ValueError(
            "Cannot find Visual Studio. This notebook requires a C++ compiler. If you installed "
            "a C++ compiler, please add the directory that contains cl.exe to `os.environ['PATH']`."
        )
    else:
        # If multiple versions of MSVC are installed, get the most recent version
        cl_path = cl_paths[-1]
        vs_dir = str(cl_path.parent)
        os.environ["PATH"] += f"{os.pathsep}{vs_dir}"
        # Code for finding the library dirs from
        # https://stackoverflow.com/questions/47423246/get-pythons-lib-path
        d = distutils.core.Distribution()
        b = distutils.command.build_ext.build_ext(d)
        b.finalize_options()
        os.environ["LIB"] = os.pathsep.join(b.library_dirs)
        print(f"Added {vs_dir} to PATH")

import sys
import time
import warnings  # to disable warnings on export to ONNX
import zipfile
from pathlib import Path
import logging

import torch

import nncf  # Important - should be imported directly after torch

import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms

from nncf.common.utils.logger import set_log_level

set_log_level(logging.ERROR)  # Disables all NNCF info and warning messages

from nncf import NNCFConfig
from nncf.torch import create_compressed_model, register_default_init_args
from openvino.runtime import Core
from torch.jit import TracerWarning

sys.path.append("../utils")
from notebook_utils import download_file

torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

MODEL_DIR = Path("model")
OUTPUT_DIR = Path("output")
DATA_DIR = Path("data")
BASE_MODEL_NAME = "resnet18"
image_size = 64

OUTPUT_DIR.mkdir(exist_ok=True)
MODEL_DIR.mkdir(exist_ok=True)
DATA_DIR.mkdir(exist_ok=True)

# Paths where PyTorch, ONNX and OpenVINO IR models will be stored
fp32_pth_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".pth")
fp32_onnx_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".onnx")
fp32_ir_path = fp32_onnx_path.with_suffix(".xml")
int8_onnx_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_int8")).with_suffix(".onnx")
int8_ir_path = int8_onnx_path.with_suffix(".xml")

# It is possible to train the FP32 model from scratch, but that may be slow,
# so pre-trained weights are downloaded by default.
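For reference, the path scheme above resolves as follows (a quick illustrative check, not one of the original notebook cells):

# Hypothetical sanity check of where the artifacts will be written
print(fp32_pth_path)   # model/resnet18_fp32.pth
print(fp32_onnx_path)  # output/resnet18_fp32.onnx
print(fp32_ir_path)    # output/resnet18_fp32.xml
print(int8_onnx_path)  # output/resnet18_int8.onnx
print(int8_ir_path)    # output/resnet18_int8.xml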
pretrained_on_tiny_imagenet = True
fp32_pth_url = "https://storage.openvinotoolkit.org/repositories/nncf/openvino_notebook_ckpts/302_resnet18_fp32_v1.pth"
download_file(fp32_pth_url, directory=MODEL_DIR, filename=fp32_pth_path.name)

# Download Tiny ImageNet dataset
def download_tiny_imagenet_200(
    data_dir: Path,
    url="http://cs231n.stanford.edu/tiny-imagenet-200.zip",
    tarname="tiny-imagenet-200.zip",
):
    archive_path = data_dir / tarname
    download_file(url, directory=data_dir, filename=tarname)
    zip_ref = zipfile.ZipFile(archive_path, "r")
    zip_ref.extractall(path=data_dir)
    zip_ref.close()

def prepare_tiny_imagenet_200(dataset_dir: Path):
    # format validation set the same way as train set is formatted
    val_data_dir = dataset_dir / 'val'
    val_annotations_file = val_data_dir / 'val_annotations.txt'
    with open(val_annotations_file, 'r') as f:
        val_annotation_data = map(lambda line: line.split('\t')[:2], f.readlines())
    val_images_dir = val_data_dir / 'images'
    for image_filename, image_label in val_annotation_data:
        from_image_filepath = val_images_dir / image_filename
        to_image_dir = val_data_dir / image_label
        if not to_image_dir.exists():
            to_image_dir.mkdir()
        to_image_filepath = to_image_dir / image_filename
        from_image_filepath.rename(to_image_filepath)
    val_annotations_file.unlink()
    val_images_dir.rmdir()

DATASET_DIR = DATA_DIR / "tiny-imagenet-200"
if not DATASET_DIR.exists():
    download_tiny_imagenet_200(DATA_DIR)
    prepare_tiny_imagenet_200(DATASET_DIR)
    print(f"Successfully downloaded and prepared dataset at: {DATASET_DIR}")

# Pre-train Floating-Point Model

# Train Function
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter("Time", ":3.3f")
    losses = AverageMeter("Loss", ":2.3f")
    top1 = AverageMeter("Acc@1", ":2.2f")
    top5 = AverageMeter("Acc@5", ":2.2f")
    progress = ProgressMeter(
        len(train_loader), [batch_time, losses, top1, top5], prefix="Epoch:[{}]".format(epoch)
    )

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        images = images.to(device)
        target = target.to(device)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do opt step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        print_frequency = 50
        if i % print_frequency == 0:
            progress.display(i)

# Validate Function
def validate(val_loader, model, criterion):
    batch_time = AverageMeter("Time", ":3.3f")
    losses = AverageMeter("Loss", ":2.3f")
    top1 = AverageMeter("Acc@1", ":2.2f")
    top5 = AverageMeter("Acc@5", ":2.2f")
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5], prefix="Test: ")

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.to(device)
            target = target.to(device)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print_frequency = 10
            if i % print_frequency == 0:
                progress.display(i)

        print(" * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5))
    return top1.avg

# Helpers
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self, name, fmt=":f"):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
        return fmtstr.format(**self.__dict__)

class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print("\t".join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = "{:" + str(num_digits) + "d}"
        return "[" + fmt + "/" + fmt.format(num_batches) + "]"

def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
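To make the semantics of accuracy() concrete, here is a small worked example with hypothetical values (not one of the original notebook cells): top-1 credits only the argmax, while top-k credits any target among the k highest logits.

# Hypothetical sanity check of the accuracy() helper
logits = torch.tensor([[0.1, 0.9, 0.0],    # top-1 = class 1 (correct)
                       [0.8, 0.1, 0.1]])   # top-1 = class 0 (wrong); top-2 = {0, 1}, still wrong
targets = torch.tensor([1, 2])
top1, top2 = accuracy(logits, targets, topk=(1, 2))
print(top1.item(), top2.item())  # 50.0 50.0 - percentages over the batch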
# Get a Pre-trained FP32 Model
num_classes = 200  # 200 is for Tiny ImageNet; the default is 1000 for ImageNet
init_lr = 1e-4
batch_size = 128
epochs = 4

model = models.resnet18(pretrained=not pretrained_on_tiny_imagenet)
# update the last FC layer for the Tiny ImageNet number of classes
model.fc = nn.Linear(in_features=512, out_features=num_classes, bias=True)
model.to(device)

# Data loading code
train_dir = DATASET_DIR / "train"
val_dir = DATASET_DIR / "val"
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

train_dataset = datasets.ImageFolder(
    train_dir,
    transforms.Compose(
        [
            transforms.Resize(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    ),
)
val_dataset = datasets.ImageFolder(
    val_dir,
    transforms.Compose(
        [
            transforms.Resize(image_size),
            transforms.ToTensor(),
            normalize,
        ]
    ),
)

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, sampler=None
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True
)

# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)

if pretrained_on_tiny_imagenet:
    #
    # ** WARNING: torch.load uses Python's pickle module, which may be used to perform
    # arbitrary code execution during unpickling. Only load data that you trust.
    #
    checkpoint = torch.load(str(fp32_pth_path), map_location="cpu")
    model.load_state_dict(checkpoint["state_dict"], strict=True)
    acc1_fp32 = checkpoint["acc1"]
else:
    best_acc1 = 0
    # Training loop
    for epoch in range(0, epochs):
        # run a single training epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion)

        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if is_best:
            checkpoint = {"state_dict": model.state_dict(), "acc1": acc1}
            torch.save(checkpoint, fp32_pth_path)
    acc1_fp32 = best_acc1

print(f"Accuracy of FP32 model: {acc1_fp32:.3f}")

dummy_input = torch.randn(1, 3, image_size, image_size).to(device)

torch.onnx.export(model, dummy_input, fp32_onnx_path)
print(f"FP32 ONNX model was exported to {fp32_onnx_path}.")

# Create and Initialize Quantization
nncf_config_dict = {
    "input_info": {"sample_size": [1, 3, image_size, image_size]},
    "log_dir": str(OUTPUT_DIR),  # log directory for NNCF-specific logging outputs
    "compression": {
        "algorithm": "quantization",  # specify the algorithm here
    },
}
nncf_config = NNCFConfig.from_dict(nncf_config_dict)
nncf_config = register_default_init_args(nncf_config, train_loader)

compression_ctrl, model = create_compressed_model(model, nncf_config)

acc1 = validate(val_loader, model, criterion)
print(f"Accuracy of initialized INT8 model: {acc1:.3f}")

# Fine-tune the Compressed Model
compression_lr = init_lr / 10
optimizer = torch.optim.Adam(model.parameters(), lr=compression_lr)

# train for one epoch with NNCF
train(train_loader, model, criterion, optimizer, epoch=0)

# evaluate on validation set after Quantization-Aware Training (QAT)
acc1_int8 = validate(val_loader, model, criterion)

print(f"Accuracy of tuned INT8 model: {acc1_int8:.3f}")
print(f"Accuracy drop of tuned INT8 model over pre-trained FP32 model: {acc1_fp32 - acc1_int8:.3f}")

# Export INT8 Model to ONNX
if not int8_onnx_path.exists():
    warnings.filterwarnings("ignore", category=TracerWarning)
    warnings.filterwarnings("ignore", category=UserWarning)
    # Export INT8 model to ONNX that is supported by the OpenVINO™ toolkit
    compression_ctrl.export_model(int8_onnx_path)
    print(f"INT8 ONNX model exported to {int8_onnx_path}.")

# Convert ONNX models to OpenVINO Intermediate Representation (IR)
if not fp32_ir_path.exists():
    !mo --input_model $fp32_onnx_path --input_shape "[1,3, $image_size, $image_size]" --mean_values "[123.675, 116.28 , 103.53]" --scale_values "[58.395, 57.12 , 57.375]" --data_type FP16 --output_dir $OUTPUT_DIR

if not int8_ir_path.exists():
    !mo --input_model $int8_onnx_path --input_shape "[1,3, $image_size, $image_size]" --mean_values "[123.675, 116.28 , 103.53]" --scale_values "[58.395, 57.12 , 57.375]" --data_type FP16 --output_dir $OUTPUT_DIR

# Benchmark Model Performance by Computing Inference Time
def parse_benchmark_output(benchmark_output):
    parsed_output = [
        line for line in benchmark_output
        if not (line.startswith(r"[") or line.startswith("    ") or line == "")
    ]
    print(*parsed_output, sep='\n')

print('Benchmark FP32 model (IR)')
benchmark_output = ! benchmark_app -m $fp32_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)

print('Benchmark INT8 model (IR)')
benchmark_output = ! benchmark_app -m $int8_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)

# Show CPU Information for reference
ie = Core()
ie.get_property(device_name="CPU", name="FULL_DEVICE_NAME")
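Beyond benchmarking, the INT8 IR can be smoke-tested with the OpenVINO Runtime API. The following is a minimal sketch, assuming the conversion step above succeeded; the random input only verifies that inference executes (a real image would be passed as raw 0-255 RGB resized to 64x64, since mean_values/scale_values were folded into the IR by mo).

import numpy as np

core = Core()
int8_model = core.read_model(model=int8_ir_path)
int8_compiled = core.compile_model(model=int8_model, device_name="CPU")
output_layer = int8_compiled.output(0)

# Random NCHW input, just to confirm the compiled model runs end to end
input_image = np.random.rand(1, 3, image_size, image_size).astype(np.float32)
result = int8_compiled([input_image])[output_layer]
print("Predicted class index:", np.argmax(result))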
#python #openvino #openvino-notebooks #deeplearning #accelerated-inference #tensorflow #quantization #nncf #optimization
305-tensorflow-quantization-aware-training: Optimizing TensorFlow models with the Neural Network Compression Framework (NNCF) of OpenVINO using 8-bit quantization

# Imports and Settings
from pathlib import Path
import logging

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.python.keras import layers
from tensorflow.python.keras import models

from nncf import NNCFConfig
from nncf.tensorflow.helpers.model_creation import create_compressed_model
from nncf.tensorflow.initialization import register_default_init_args
from nncf.common.utils.logger import set_log_level

set_log_level(logging.ERROR)

MODEL_DIR = Path("model")
OUTPUT_DIR = Path("output")
MODEL_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

BASE_MODEL_NAME = "ResNet-18"

fp32_h5_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".h5")
fp32_sm_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_fp32"))
fp32_ir_path = Path(OUTPUT_DIR / "saved_model").with_suffix(".xml")
int8_pb_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_int8")).with_suffix(".pb")
int8_pb_name = Path(BASE_MODEL_NAME + "_int8").with_suffix(".pb")
int8_ir_path = int8_pb_path.with_suffix(".xml")

BATCH_SIZE = 128
IMG_SIZE = (64, 64)  # Input image size used in this notebook
NUM_CLASSES = 10  # For Imagenette dataset

LR = 1e-5

MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)  # From Imagenet dataset
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)  # From Imagenet dataset

fp32_pth_url = "https://storage.openvinotoolkit.org/repositories/nncf/openvino_notebook_ckpts/305_resnet18_imagenette_fp32_v1.h5"
_ = tf.keras.utils.get_file(fp32_h5_path.resolve(), fp32_pth_url)
print(f'Absolute path where the model weights are saved:\n {fp32_h5_path.resolve()}')

# Dataset Preprocessing
datasets, datasets_info = tfds.load('imagenette/160px', shuffle_files=True, as_supervised=True,
                                    with_info=True, read_config=tfds.ReadConfig(shuffle_seed=0))
train_dataset, validation_dataset = datasets['train'], datasets['validation']
fig = tfds.show_examples(train_dataset, datasets_info)

def preprocessing(image, label):
    image = tf.image.resize(image, IMG_SIZE)
    image = image - MEAN_RGB
    image = image / STDDEV_RGB
    label = tf.one_hot(label, NUM_CLASSES)
    return image, label

train_dataset = (train_dataset.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                 .batch(BATCH_SIZE)
                 .prefetch(tf.data.experimental.AUTOTUNE))

validation_dataset = (validation_dataset.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                      .batch(BATCH_SIZE)
                      .prefetch(tf.data.experimental.AUTOTUNE))
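A quick shape check of the preprocessing function on one raw example can clarify what the pipeline produces (a hypothetical cell, not part of the original notebook; `datasets['train']` still holds the unmapped TFDS split):

# Hypothetical sanity check: resized, normalized image and one-hot label
raw_image, raw_label = next(iter(datasets['train'].take(1)))
proc_image, proc_label = preprocessing(raw_image, raw_label)
print(proc_image.shape)  # (64, 64, 3)
print(proc_label.shape)  # (10,)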
# Define a Floating-Point Model
def residual_conv_block(filters, stage, block, strides=(1, 1), cut='pre'):
    def layer(input_tensor):
        x = layers.BatchNormalization(epsilon=2e-5)(input_tensor)
        x = layers.Activation('relu')(x)

        # defining shortcut connection
        if cut == 'pre':
            shortcut = input_tensor
        elif cut == 'post':
            shortcut = layers.Conv2D(filters, (1, 1), strides=strides,
                                     kernel_initializer='he_uniform', use_bias=False)(x)

        # continue with convolution layers
        x = layers.ZeroPadding2D(padding=(1, 1))(x)
        x = layers.Conv2D(filters, (3, 3), strides=strides, kernel_initializer='he_uniform', use_bias=False)(x)

        x = layers.BatchNormalization(epsilon=2e-5)(x)
        x = layers.Activation('relu')(x)
        x = layers.ZeroPadding2D(padding=(1, 1))(x)
        x = layers.Conv2D(filters, (3, 3), kernel_initializer='he_uniform', use_bias=False)(x)

        # add residual connection
        x = layers.Add()([x, shortcut])
        return x

    return layer

def ResNet18(input_shape=None):
    """Instantiates the ResNet18 architecture."""
    img_input = layers.Input(shape=input_shape, name='data')

    # ResNet18 bottom
    x = layers.BatchNormalization(epsilon=2e-5, scale=False)(img_input)
    x = layers.ZeroPadding2D(padding=(3, 3))(x)
    x = layers.Conv2D(64, (7, 7), strides=(2, 2), kernel_initializer='he_uniform', use_bias=False)(x)
    x = layers.BatchNormalization(epsilon=2e-5)(x)
    x = layers.Activation('relu')(x)
    x = layers.ZeroPadding2D(padding=(1, 1))(x)
    x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='valid')(x)

    # ResNet18 body
    repetitions = (2, 2, 2, 2)
    for stage, rep in enumerate(repetitions):
        for block in range(rep):
            filters = 64 * (2 ** stage)
            if block == 0 and stage == 0:
                x = residual_conv_block(filters, stage, block, strides=(1, 1), cut='post')(x)
            elif block == 0:
                x = residual_conv_block(filters, stage, block, strides=(2, 2), cut='post')(x)
            else:
                x = residual_conv_block(filters, stage, block, strides=(1, 1), cut='pre')(x)
    x = layers.BatchNormalization(epsilon=2e-5)(x)
    x = layers.Activation('relu')(x)

    # ResNet18 top
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(NUM_CLASSES)(x)
    x = layers.Activation('softmax')(x)

    # Create model
    model = models.Model(img_input, x)

    return model

IMG_SHAPE = IMG_SIZE + (3,)
model = ResNet18(input_shape=IMG_SHAPE)

# Pre-train Floating-Point Model
# Load the floating-point weights
model.load_weights(fp32_h5_path)

# Compile the floating-point model
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
              metrics=[tf.keras.metrics.CategoricalAccuracy(name='acc@1')])

# Validate the floating-point model
test_loss, acc_fp32 = model.evaluate(
    validation_dataset,
    callbacks=tf.keras.callbacks.ProgbarLogger(stateful_metrics=['acc@1']))
print(f"\nAccuracy of FP32 model: {acc_fp32:.3f}")

model.save(fp32_sm_path)
print(f'Absolute path where the model is saved:\n {fp32_sm_path.resolve()}')

# Create and Initialize Quantization
nncf_config_dict = {
    "input_info": {"sample_size": [1, 3] + list(IMG_SIZE)},
    "log_dir": str(OUTPUT_DIR),  # log directory for NNCF-specific logging outputs
    "compression": {
        "algorithm": "quantization",  # specify the algorithm here
    },
}
nncf_config = NNCFConfig.from_dict(nncf_config_dict)

nncf_config = register_default_init_args(nncf_config=nncf_config,
                                         data_loader=train_dataset,
                                         batch_size=BATCH_SIZE)

compression_ctrl, model = create_compressed_model(model, nncf_config)

# Compile the int8 model
model.compile(optimizer=tf.keras.optimizers.Adam(lr=LR),
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
              metrics=[tf.keras.metrics.CategoricalAccuracy(name='acc@1')])

# Validate the int8 model
test_loss, test_acc = model.evaluate(
    validation_dataset,
    callbacks=tf.keras.callbacks.ProgbarLogger(stateful_metrics=['acc@1']))
print(f"\nAccuracy of INT8 model after initialization: {test_acc:.3f}")

# Fine-tune the Compressed Model
# Train the int8 model
model.fit(train_dataset, epochs=2)

# Validate the int8 model
test_loss, acc_int8 = model.evaluate(
    validation_dataset,
    callbacks=tf.keras.callbacks.ProgbarLogger(stateful_metrics=['acc@1']))
print(f"\nAccuracy of INT8 model after fine-tuning: {acc_int8:.3f}")
print(f"\nAccuracy drop of tuned INT8 model over pre-trained FP32 model: {acc_fp32 - acc_int8:.3f}")

compression_ctrl.export_model(int8_pb_path, 'frozen_graph')
print(f'Absolute path where the int8 model is saved:\n {int8_pb_path.resolve()}')

# Export Frozen Graph Models to OpenVINO Intermediate Representation (IR)
!mo --framework=tf --input_shape=[1,64,64,3] --input=data --saved_model_dir=$fp32_sm_path --output_dir=$OUTPUT_DIR
!mo --framework=tf --input_shape=[1,64,64,3] --input=Placeholder --input_model=$int8_pb_path --output_dir=$OUTPUT_DIR

# Benchmark Model Performance by Computing Inference Time
def parse_benchmark_output(benchmark_output):
    parsed_output = [
        line for line in benchmark_output
        if not (line.startswith(r"[") or line.startswith("    ") or line == "")
    ]
    print(*parsed_output, sep='\n')

print('Benchmark FP32 model (IR)')
benchmark_output = ! benchmark_app -m $fp32_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)

print('\nBenchmark INT8 model (IR)')
benchmark_output = ! benchmark_app -m $int8_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)

# Show CPU Information for reference
from openvino.runtime import Core

ie = Core()
ie.get_property(device_name='CPU', name="FULL_DEVICE_NAME")
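As a rough consistency check on the two IRs, the sketch below feeds the same random input to the FP32 and INT8 models and compares their outputs. It assumes both mo conversions above succeeded and produced the same input shape/layout; the input shape is queried from the compiled FP32 model rather than hard-coded, since mo may change the layout of TensorFlow models.

import numpy as np

core = Core()
compiled_fp32 = core.compile_model(core.read_model(fp32_ir_path), device_name="CPU")
compiled_int8 = core.compile_model(core.read_model(int8_ir_path), device_name="CPU")

# Assumption: both IRs share the input shape reported by the FP32 model
input_shape = tuple(int(d) for d in compiled_fp32.input(0).shape)
dummy = np.random.default_rng(0).random(input_shape).astype(np.float32)

out_fp32 = compiled_fp32([dummy])[compiled_fp32.output(0)]
out_int8 = compiled_int8([dummy])[compiled_int8.output(0)]
print("Max abs difference between FP32 and INT8 outputs:", np.abs(out_fp32 - out_int8).max())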
Source: https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/302-pytorch-quantization-aware-training/302-pytorch-quantization-aware-training.ipynb (saved Sat Jun 18 2022 20:53:15 GMT+0000)
Source: https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/305-tensorflow-quantization-aware-training/305-tensorflow-quantization-aware-training.ipynb (saved Sat Jun 18 2022 20:47:05 GMT+0000)