speed.py

   1 #!/usr/bin/env python
   2
   3 import time, torch
   4
   5 if torch.cuda.is_available():
   6     device = torch.device('cuda')
   7     sync = lambda: torch.cuda.synchronize()
   8 else:
   9     device = torch.device('cpu')
  10     sync = lambda: None
  11
  12 nb_runs = 10000
  13 d1, d2, d3 = 2048, 2048, 2048
  14
  15 for t in [ torch.float32, torch.float16 ]:
  16     a = torch.rand(d1, d2, device = device, dtype = t)
  17     b = torch.rand(d2, d3, device = device, dtype = t)
  18
  19     sync()
  20     start_time = time.perf_counter()
  21     for k in range(nb_runs):
  22         c = a @ b
  23     sync()
  24     duration = time.perf_counter() - start_time
  25
  26     nb_flop = float(nb_runs * d1 * d2 * d3 * 2) # 1 multiply-and-add is 2 ops
  27     speed = nb_flop / duration
  28
  29     for u in [ '', 'K', 'M', 'G', 'T', 'P' ]:
  30         if speed < 1e3: break
  31         speed /= 1e3
  32
  33     print(f'{speed:.02f} {u}flops with {t} on {device}')