import tensorflow as tf, numpy as np, os, time, tf_keras as keras
from ai_edge_litert.interpreter import Interpreter, load_delegate
# Validation images; documented shape: (444, 160, 160, 3).
X_val = np.load('models/cats_X_val.npy')
# Validation labels; documented shape: (444, 1) with classes 1..6.
# Shift to 0..5 so labels line up with argmax indices, and flatten to 1-D:
# comparing (N,) predictions against (N, 1) labels would broadcast to an
# (N, N) matrix and the mean of that is not the accuracy.
y_val = np.load('models/cats_y_val.npy').reshape(-1) - 1

# Load Keras model
model = keras.models.load_model("models/cats.keras")

# Calculate accuracy of the TF model. The timer wraps the single batched
# predict() call; the per-inference figure is that total divided by the
# number of samples.
tf_start = time.perf_counter()
y_pred = model.predict(X_val)
tf_end = time.perf_counter()

# Predicted class = index of the highest score per sample.
preds = np.argmax(y_pred, axis=1)
acc_tf = (preds == y_val).mean()
print(f"TF/Keras accuracy: {acc_tf*100:.2f}% (time per inference: {(tf_end - tf_start) * 1000 / X_val.shape[0]:.4g}ms)")
print('')
# Convert to a quantized (int8) TFLite file. The validation set loaded earlier
# is reused as the representative dataset to improve accuracy.
# The conversion is skipped when the output file already exists.
TFLITE_FILE = 'cats_i8.tflite'
if not os.path.exists(TFLITE_FILE):
    print(f'Converting to TFLite file ({TFLITE_FILE})...')

    def rep_dataset():
        """Yield the validation samples one at a time as batch-of-1 inputs."""
        # NOTE: yields a single array per step, so this only matches models
        # with exactly one input.
        for i in range(X_val.shape[0]):
            yield [X_val[i:i+1]]

    # Build a fixed batch=1 input signature (QNN cannot handle dynamic dims)
    specs = []
    for t in model.inputs:
        if None in t.shape[1:]:
            raise ValueError(f"Non-batch dims must be known; got {t.shape}")
        specs.append(tf.TensorSpec([1, *t.shape[1:]], dtype=t.dtype, name=t.name.split(':')[0]))

    @tf.function(input_signature=specs)
    def serve(*xs):
        """Run the model on the fixed-shape inputs and return a tuple of outputs."""
        # Keras expects all inputs in the first argument; unpacking them as
        # positional args would bind a second input to `training`.
        inputs = xs[0] if len(xs) == 1 else list(xs)
        # Request inference behaviour explicitly rather than relying on the default.
        y = model(inputs, training=False)
        return y if isinstance(y, (tuple, list)) else (y,)  # keep output order stable

    concrete = serve.get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete], model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = rep_dataset
    # Restrict to int8 builtin ops and use int8 tensors at the model boundary.
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.inference_input_type = tf.int8
    converter.inference_output_type = tf.int8
    tflite_model = converter.convert()

    # Write to a temporary file and rename it into place. The existence check
    # above acts as a cache, so an interrupted write must never leave a
    # truncated file under the final name.
    tmp_file = TFLITE_FILE + '.tmp'
    with open(tmp_file, "wb") as f:
        f.write(tflite_model)
    os.replace(tmp_file, TFLITE_FILE)
    print(f"TFLite written: {TFLITE_FILE} ({os.path.getsize(TFLITE_FILE)/1e6:.2f} MB)")
else:
    print(f'TFLite file already exists ({TFLITE_FILE})')
print('')
def run_tflite_model(model_path, use_npu):
    """Evaluate a TFLite model on the validation set and print the result.

    Runs every sample of the module-level ``X_val`` through the interpreter
    one at a time, compares the argmax of each output against the
    module-level ``y_val``, and prints the accuracy and the average
    wall-clock time per sample (input/output scaling included).

    Args:
        model_path: Path to the ``.tflite`` file to load.
        use_npu: When truthy, load the QNN TFLite delegate with the ``htp``
            backend; otherwise run without any delegate.

    Returns:
        None. Results are printed to stdout.
    """
    # Use QNN to run this model on NPU
    experimental_delegates = []
    if use_npu:
        experimental_delegates = [load_delegate("libQnnTFLiteDelegate.so", options={"backend_type": "htp"})]

    # Get accuracy for the quantized TFLite file, construct the interpreter
    interpreter = Interpreter(model_path=model_path, experimental_delegates=experimental_delegates)
    interpreter.allocate_tensors()
    in_details = interpreter.get_input_details()[0]
    out_details = interpreter.get_output_details()[0]

    # You need to scale the input / output yourself using quantization params
    in_scale, in_zp = in_details["quantization"]
    out_scale, out_zp = out_details["quantization"]
    in_dtype = in_details["dtype"]

    # A scale of 0 means the tensor carries no quantization parameters; in
    # that case the data is passed through instead of dividing by zero.
    in_quantized = in_scale != 0 and np.issubdtype(in_dtype, np.integer)
    if in_quantized:
        in_range = np.iinfo(in_dtype)

    # Loop through one-by-one (most TFLite files have a fixed batch size of 1)
    preds_tflite = []
    tflite_start = time.perf_counter()
    for i in range(X_val.shape[0]):
        # Scale input and invoke
        x = X_val[i:i+1]
        if in_quantized:
            # Clip before casting: values outside the integer range would
            # otherwise wrap around (or be undefined) in astype().
            x_q = np.clip(np.round(x / in_scale + in_zp), in_range.min, in_range.max).astype(in_dtype)
        else:
            x_q = x.astype(in_dtype)
        interpreter.set_tensor(in_details["index"], x_q)
        interpreter.invoke()

        # Scale output back to f32
        out = interpreter.get_tensor(out_details["index"]).astype(np.float32)
        if out_scale != 0:
            # Skipped for scale 0: multiplying by it would zero every score
            # and make argmax always return class 0.
            out = (out - out_zp) * out_scale

        # And add the outcome to the predictions
        preds_tflite.append(np.argmax(out, axis=1)[0])
    tflite_end = time.perf_counter()

    # Compare accuracy in the same way as above. Labels are flattened first so
    # an (N, 1) label array cannot broadcast against the (N,) predictions.
    labels = np.asarray(y_val).reshape(-1)
    acc_tflite = (np.array(preds_tflite) == labels).mean()
    ms_per_inference = (tflite_end - tflite_start) * 1000 / X_val.shape[0]
    target = "NPU" if use_npu else "CPU"
    print(f"Quantized TFLite accuracy ({target}): {acc_tflite*100:.2f}% (time per inference: {ms_per_inference:.4g}ms)")
# Evaluate the quantized model twice: first without a delegate, then with the
# NPU delegate enabled.
for use_npu in (False, True):
    run_tflite_model(TFLITE_FILE, use_npu)