Reducing memory size leads to a decrease in load cycles
Hello and thank you for creating this great open source tool!
I am currently trying to model my accelerator in ZigZag for DSE, but I encountered an issue when running ZigZag with different memory sizes. To check if my implementation is the problem, I recreated the issue with the Eyeriss implementation from the repository.
This is my test code for Eyeriss:
from zigzag import api
import zigzag
import os
from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy
from zigzag.classes.hardware.architecture.memory_level import MemoryLevel
from zigzag.classes.hardware.architecture.operational_unit import Multiplier
from zigzag.classes.hardware.architecture.operational_array import MultiplierArray
from zigzag.classes.hardware.architecture.memory_instance import MemoryInstance
from zigzag.classes.hardware.architecture.accelerator import Accelerator
from zigzag.classes.hardware.architecture.core import Core
from zigzag.classes.stages import *
from zigzag.classes.cost_model.cost_model import CostModelEvaluation
from typing import Type
import re
from onnx import ModelProto
import pickle
from zigzag.visualization.results.plot_cme import bar_plot_cost_model_evaluations_breakdown
import numpy as np
import matplotlib.pyplot as plt
import itertools
from dataclasses import dataclass
@dataclass
class Eyeriss_config:
DIM: int = 4
mem0_kb: int = 8
mem1_kb: int = 64
mem2_kb: int = 1024
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
    def get_mem0_bits(self) -> int:
        # Note: kB is treated as 1000 bytes = 8000 bits in these helpers
        return self.mem0_kb * 8000
def get_mem1_bits(self) -> int:
return self.mem1_kb * 8000
def get_mem2_bits(self) -> int:
return self.mem2_kb * 8000
class Eyeriss():
def __init__(self, cfg):
self.cfg = cfg
cores = self.cores_eyeriss()
acc_name = "Eyeriss"
self.eyeriss = Accelerator(acc_name, cores)
def get(self):
return self.eyeriss
    def memory_hierarchy_eyeriss(self, multiplier_array, visualize=False):
"""Memory hierarchy variables"""
""" size=#bit, bw=(read bw, write bw), cost=(read word energy, write work energy) """
rf1 = MemoryInstance(
name="rf_64B",
size=512,
r_bw=8,
w_bw=8,
r_cost=1.0,
w_cost=1.5,
area=0.3,
r_port=1,
w_port=1,
rw_port=0,
latency=1,
) # rd E per bit 0.125
rf2 = MemoryInstance(
name="rf_16B",
size=128,
r_bw=24,
w_bw=24,
r_cost=1.5,
w_cost=2,
area=0.95,
r_port=1,
w_port=1,
rw_port=1,
latency=1,
) # rd E per bit 0.0625
# lb1 = MemoryInstance(name="sram_64KB", size=524288, r_bw=128, w_bw=128, r_cost=20, w_cost=25, area=6, r_port=1, w_port=1, rw_port=0, latency=1) # rd E per bit 0.16
lb2 = MemoryInstance(
name="sram_8KB",
size=self.cfg.get_mem0_bits(),
r_bw=128,
w_bw=128,
r_cost=10,
w_cost=15,
r_port=0,
area=3,
w_port=0,
rw_port=2,
latency=1,
) # rd E per bit 0.08
lb2_64KB = MemoryInstance(
name="sram_64KB",
size=self.cfg.get_mem1_bits(),
r_bw=128,
w_bw=128,
r_cost=20,
w_cost=25,
area=6,
r_port=1,
w_port=1,
rw_port=0,
latency=1,
) # rd E per bit 0.08
gb = MemoryInstance(
name="sram_1M",
size=self.cfg.get_mem2_bits(),
r_bw=384,
w_bw=384,
r_cost=100,
w_cost=130,
area=25,
r_port=0,
w_port=0,
rw_port=2,
latency=1,
) # rd E per bit 0.26
dram = MemoryInstance(
name="dram",
size=10000000000,
r_bw=64,
w_bw=64,
r_cost=1000,
w_cost=1000,
area=0,
r_port=0,
w_port=0,
rw_port=1,
latency=1,
) # rd E per bit 16
memory_hierarchy_graph = MemoryHierarchy(operational_array=multiplier_array)
"""
fh: from high = wr_in_by_high
fl: from low = wr_in_by_low
th: to high = rd_out_to_high
tl: to low = rd_out_to_low
"""
memory_hierarchy_graph.add_memory(
memory_instance=rf1,
operands=("I1",),
port_alloc=({"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None},),
served_dimensions=set(),
)
memory_hierarchy_graph.add_memory(
memory_instance=rf1,
operands=("I2",),
port_alloc=({"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None},),
served_dimensions=set(),
)
memory_hierarchy_graph.add_memory(
memory_instance=rf2,
operands=("O",),
port_alloc=(
{"fh": "rw_port_1", "tl": "r_port_1", "fl": "w_port_1", "th": "rw_port_1"},
),
served_dimensions=set(),
)
memory_hierarchy_graph.add_memory(
memory_instance=lb2,
operands=("O",),
port_alloc=(
{
"fh": "rw_port_1",
"tl": "rw_port_2",
"fl": "rw_port_2",
"th": "rw_port_1",
},
),
served_dimensions="all",
)
memory_hierarchy_graph.add_memory(
memory_instance=lb2_64KB,
operands=("I2",),
port_alloc=({"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None},),
served_dimensions="all",
)
memory_hierarchy_graph.add_memory(
memory_instance=gb,
operands=("I1", "O"),
port_alloc=(
{"fh": "rw_port_1", "tl": "rw_port_2", "fl": None, "th": None},
{
"fh": "rw_port_1",
"tl": "rw_port_2",
"fl": "rw_port_2",
"th": "rw_port_1",
},
),
served_dimensions="all",
)
memory_hierarchy_graph.add_memory(
memory_instance=dram,
operands=("I1", "I2", "O"),
port_alloc=(
{"fh": "rw_port_1", "tl": "rw_port_1", "fl": None, "th": None},
{"fh": "rw_port_1", "tl": "rw_port_1", "fl": None, "th": None},
{
"fh": "rw_port_1",
"tl": "rw_port_1",
"fl": "rw_port_1",
"th": "rw_port_1",
},
),
served_dimensions="all",
)
if visualize:
from zigzag.visualization.graph.memory_hierarchy import (
visualize_memory_hierarchy_graph,
)
visualize_memory_hierarchy_graph(memory_hierarchy_graph)
return memory_hierarchy_graph
def systolic_array_eyeriss(self):
"""Multiplier array variables"""
multiplier_input_precision = [8, 8]
multiplier_energy = 0.5
multiplier_area = 0.1
dimensions = {"D1": self.cfg.DIM, "D2": self.cfg.DIM}
multiplier = Multiplier(
multiplier_input_precision, multiplier_energy, multiplier_area
)
multiplier_array = MultiplierArray(multiplier, dimensions)
return multiplier_array
def cores_eyeriss(self):
multiplier_array1 = self.systolic_array_eyeriss()
        memory_hierarchy1 = self.memory_hierarchy_eyeriss(multiplier_array1)
core1 = Core(1, multiplier_array1, memory_hierarchy1)
return {core1}
def get_mapping(self):
mapping = {
"default": {
"core_allocation": 1,
"spatial_mapping": {"D1": ("K", self.cfg.DIM), "D2": ("C", self.cfg.DIM)},
"memory_operand_links": {"O": "O", "W": "I2", "I": "I1"},
},
"GEMM": {
"core_allocation": 1,
"spatial_mapping": {"D1": ("K", self.cfg.DIM), "D2": ("M", self.cfg.DIM)},
"memory_operand_links": {"O": "O", "W": "I2", "I": "I1"},
},
"Add": {
"core_allocation": 1,
"spatial_mapping": {"D1": ("G", self.cfg.DIM), "D2": ("C", 1)},
"memory_operand_links": {"O": "O", "X": "I2", "Y": "I1"},
},
"Pooling": {
"core_allocation": 1,
"spatial_mapping": {"D1": ("G", self.cfg.DIM), "D2": ("C", 1)},
"memory_operand_links": {"O": "O", "W": "I2", "I": "I1"},
},
}
return mapping
M = 100
N = 100
K = 100
workload = {
0: { # conv1, stride 2
"operator_type": "Conv",
"equation": "O[b][k][oy][ox]+=W[k][c][fy][fx]*I[b][c][iy][ix]",
"dimension_relations": ["ix=2*ox+1*fx", "iy=2*oy+1*fy"],
"loop_dim_size": {
"B": M,
"K": N,
"C": K,
"OY": 1,
"OX": 1,
"FY": 1,
"FX": 1,
},
"operand_precision": {"O": 32, "O_final": 32, "W": 32, "I": 32},
"operand_source": {"W": [], "I": []},
"constant_operands": ["I", "W"],
},
1: { # GEMM
"operator_type": "GEMM",
"equation": "O[m][n]+=W[m][k]*I[k][n]",
"loop_dim_size": {
"M": M,
"N": N,
"K": K,
},
"operand_precision": {"O": 32, "O_final": 32, "W": 32, "I": 32},
"operand_source": {"W": [], "I": []},
"constant_operands": ["I", "W"],
},
}
eyeriss = Eyeriss(Eyeriss_config(DIM=4, mem0_kb=1, mem1_kb=2, mem2_kb=4))
# eyeriss = Eyeriss(Eyeriss_config(DIM=4, mem0_kb=8, mem1_kb=64, mem2_kb=1024))
energy, latency, cme = api.get_hardware_performance_zigzag(
workload,
eyeriss.get(),
eyeriss.get_mapping(),
opt="EDP",
dump_filename_pattern="outputs/{datetime}.json",
pickle_filename="outputs/list_of_cmes.pickle",
lpf_limit=6
)
# Load in the pickled list of CMEs
with open("outputs/list_of_cmes.pickle", 'rb') as fp:
cme_for_all_layers = pickle.load(fp)
# Plot all the layers and save to 'plot_all.png'
bar_plot_cost_model_evaluations_breakdown(cme_for_all_layers, save_path="outputs/plot_breakdown.png")
print(f"Energy: {energy} - Latency: {latency} - CME: {cme}")
print(cme_for_all_layers[1].latency_total2)
print(cme_for_all_layers[-1].data_loading_cycle)
Now when I run this with the default memory sizes: eyeriss = Eyeriss(Eyeriss_config(DIM=4, mem0_kb=8, mem1_kb=64, mem2_kb=1024))
I get:
latency_total2 = 260519
data_loading_cycle = 5020
But when I run it with: eyeriss = Eyeriss(Eyeriss_config(DIM=4, mem0_kb=1, mem1_kb=2, mem2_kb=4))
I get:
latency_total2 = 250439
data_loading_cycle = 220
So decreasing the memory size leads to far fewer cycles for data loading, which seems very counterintuitive to me. I am using the latest version of ZigZag from the master branch. Maybe I messed up somewhere in my configuration?
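To make the trend easier to check, this is roughly how I would sweep the memory sizes on top of the script above (just a sketch; the scale values and the 1:8:128 size ratio, which reproduces the default 8/64/1024 kB config at scale 8, are arbitrary choices of mine):

# Sketch: scale all three on-chip memories together and print the two
# cycle counts in question for each scale (scale=8 equals the default config).
for scale in (1, 2, 4, 8):
    eyeriss = Eyeriss(Eyeriss_config(DIM=4, mem0_kb=1 * scale, mem1_kb=8 * scale, mem2_kb=128 * scale))
    energy, latency, _ = api.get_hardware_performance_zigzag(
        workload,
        eyeriss.get(),
        eyeriss.get_mapping(),
        opt="EDP",
        dump_filename_pattern="outputs/{datetime}.json",
        pickle_filename=f"outputs/cmes_scale_{scale}.pickle",
        lpf_limit=6,
    )
    with open(f"outputs/cmes_scale_{scale}.pickle", "rb") as fp:
        cmes = pickle.load(fp)
    print(f"scale={scale}: latency_total2={cmes[1].latency_total2}, data_loading_cycle={cmes[-1].data_loading_cycle}")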
However, when looking at the evaluation breakdown for both configs, the energy usage goes way up for the smaller memories (as I would expect):

eyeriss = Eyeriss(Eyeriss_config(DIM=4, mem0_kb=8, mem1_kb=64, mem2_kb=1024)):
[cost model evaluation breakdown plot]

eyeriss = Eyeriss(Eyeriss_config(DIM=4, mem0_kb=1, mem1_kb=2, mem2_kb=4)):
[cost model evaluation breakdown plot]
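In case the exact numbers are more useful than the plots, this is how I would also print the per-layer energy from the pickled CMEs (a sketch; I am assuming the CME objects expose an energy_total attribute, analogous to latency_total2 above):

# Sketch: print the total energy per layer for the current run
# (assumes CostModelEvaluation has an `energy_total` attribute).
for i, cme in enumerate(cme_for_all_layers):
    print(f"layer {i}: energy_total = {cme.energy_total}")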
I would love some input on this.

Best regards,
Lukas