# Time `nb_runs` products of a (d1, d2) matrix by a (d2, d3) matrix on `device`.
# Operands are created outside the timed region so their allocation and the
# host-to-device copy are not measured.
a, b = torch.rand(d1, d2).to(device), torch.rand(d2, d3).to(device)

# `sync` is defined elsewhere in the file; presumably it blocks until all work
# queued on the device has completed -- TODO confirm. It must be *called*: the
# bare name `sync` is an expression that does nothing, which would leave
# pending work from the setup above inside the timed region.
sync()
start_time = time.perf_counter()
for k in range(nb_runs):
    c = a @ b
# Wait again before reading the clock so that, if the products were launched
# asynchronously, the measurement covers their execution and not just the
# time needed to enqueue them.
sync()
duration = time.perf_counter() - start_time

# Each product performs d1 * d2 * d3 multiply-and-adds.
nb_flop = float(nb_runs * d1 * d2 * d3 * 2)  # 1 multiply-and-add is 2 ops