chipyard
chipyard copied to clipboard
Multi-Clock Support for NVDLA
Impact: rtl
Description: Place the NVDLA accelerator in a clock domain that is separate from the uncore/tile clocks.
What is a motivating example for changing the behavior? Wanted in #505
Since multiclock support is being worked on right now, I'm interested in what the use case for this is. What clocks does the NVDLA want?
I'm really happy that multiclock support is now integrated in Chipyard, and I can see how it works for tiles, buses, and some other IPs. However, I am still not sure how to move accelerators such as NVDLA and Gemmini into a different clock domain without changing the frequency of a whole bus. I thought one would need to modify ClockSinkDomain or ClockSinkNode and add an async crossing type. However, there is no clock sink defined for Gemmini and NVDLA. Has anyone done this yet?
Hi @jerryz123. Is there any new progress on multi-clock support for NVDLA?
This can be done now. The NVDLA wrapper should be modified so that it instantiates the NVDLA within its own ClockSinkDomain. The ClockSinkDomain can be attached to the subsystem's asyncClockGroupsNode, and clock crossings should be added to the NVDLA's TileLink interfaces.
Do you have any relevant reference examples? I have only recently started working with Chipyard and I am not very familiar with diplomacy. @jerryz123
Hi @adoerflinger, have you solved this?
Hi @jerryz123 @abejgonzalez. As jerryz123 suggested, I have modified the NVDLA wrapper code. The NVDLA.scala file is as follows:
// See LICENSE for license details.
package nvidia.blocks.dla
import chisel3._
import freechips.rocketchip.config._
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.amba.axi4._
import freechips.rocketchip.amba.apb._
import freechips.rocketchip.devices.tilelink.{BasicBusBlockerParams, TLClockBlocker}
import freechips.rocketchip.tilelink._
import freechips.rocketchip.interrupts._
import freechips.rocketchip.prci._
import freechips.rocketchip.subsystem._
import nvidia.blocks.ip.dla._
import sifive.blocks.util.{DeviceAttachParams, DeviceParams}
import freechips.rocketchip.regmapper._
/** Static configuration of one NVDLA instance.
  *
  * @param config       NVDLA flavour; it is appended to "nvdla_" to form the black-box name and
  *                     to "nvidia,nv_" to form the device-tree compatible string. The value
  *                     "large" additionally enables the second (CVSRAM) AXI port and a 256-bit
  *                     data bus; every other value yields a 64-bit data bus.
  * @param raddress     base address of the 256 KiB APB configuration window
  * @param synthRAMs    forwarded unchanged to the `nvdla` black box
  * @param dtsFrequency NOTE(review): only referenced from commented-out code in the wrapper —
  *                     confirm whether it is still needed
  * @param ramSize      size of the local CVSRAM in KiB (address mask is `ramSize*1024-1`);
  *                     presumably must be a power of two for the mask to be valid — confirm
  */
case class NVDLAParams(
  config: String,
  raddress: BigInt = 0x10040000L,
  synthRAMs: Boolean = false,
  dtsFrequency: BigInt = 15000000,
  ramSize: BigInt = 1
) extends DeviceParams
/** Diplomatic wrapper around the NVDLA Verilog black box.
  *
  * Exposed nodes:
  *  - `dbb_tl_node`  : TileLink master side of the DBB data port. The black box speaks AXI4
  *                     (`dbb_axi_node`); the chain below converts it to TileLink.
  *  - `cfg_tl_node`  : TileLink slave side of the configuration port, converted to APB by a
  *                     `TLToAPB` and decoded at `params.raddress` (256 KiB window).
  *  - `int_node`     : single interrupt source driven by the black box's `dla_intr`.
  *
  * When `params.config == "large"` a second AXI4 master (`cvsram_axi_node`) is created and wired
  * to a private `AXI4RAM` of `params.ramSize` KiB.
  *
  * @param params static configuration of this instance
  */
class NVDLA(params: NVDLAParams)(implicit p: Parameters) extends LazyModule {
  // def dtsFrequency = params.dtsFrequency
  // def fixedClockOpt = dtsFrequency.map(f => ClockParameters(freqMHz = f.toDouble / 1000000.0))

  val blackboxName = "nvdla_" + params.config
  val hasSecondAXI = params.config == "large"
  val dataWidthAXI = if (params.config == "large") 256 else 64

  // DTS
  val dtsdevice = new SimpleDevice("nvdla", Seq("nvidia,nv_" + params.config))

  // dbb TL
  val dbb_tl_node = TLIdentityNode()

  // dbb AXI
  val dbb_axi_node = AXI4MasterNode(
    Seq(
      AXI4MasterPortParameters(
        masters = Seq(AXI4MasterParameters(
          name = "NVDLA DBB",
          id = IdRange(0, 256))))))

  // TL <-> AXI
  (dbb_tl_node
    := TLBuffer()
    := TLWidthWidget(dataWidthAXI/8)
    := AXI4ToTL()
    := AXI4UserYanker(capMaxFlight=Some(16))
    := AXI4Fragmenter()
    := AXI4IdIndexer(idBits=3)
    := AXI4Buffer()
    := dbb_axi_node)

  // cvsram AXI
  val cvsram_axi_node = if (hasSecondAXI) Some(AXI4MasterNode(
    Seq(
      AXI4MasterPortParameters(
        masters = Seq(AXI4MasterParameters(
          name = "NVDLA CVSRAM",
          id = IdRange(0, 256)))))))
  else None

  // The backing RAM is only elaborated when the CVSRAM master exists. The parameter is named
  // explicitly so that the RAM is built inside the callback instead of in an eagerly evaluated
  // block that returns a placeholder lambda, and no Option#get is needed.
  cvsram_axi_node.foreach { cvsramMaster =>
    val sram = LazyModule(new AXI4RAM(
      address = AddressSet(0, params.ramSize*1024-1),
      beatBytes = dataWidthAXI/8))
    sram.node := cvsramMaster
  }

  // cfg APB
  val cfg_apb_node = APBSlaveNode(
    Seq(
      APBSlavePortParameters(
        slaves = Seq(APBSlaveParameters(
          address = Seq(AddressSet(params.raddress, 0x40000L-1L)), // 256KB
          resources = dtsdevice.reg("control"),
          executable = false,
          supportsWrite = true,
          supportsRead = true)),
        beatBytes = 4)))

  val cfg_tl_node = cfg_apb_node := LazyModule(new TLToAPB).node

  val int_node = IntSourceNode(IntSourcePortSimple(num = 1, resources = dtsdevice.int))

  lazy val module = new LazyModuleImp(this) {
    val u_nvdla = Module(new nvdla(params.config, blackboxName, hasSecondAXI, dataWidthAXI, params.synthRAMs))

    // The black box is clocked by this module's implicit clock and uses active-low resets.
    // NOTE(review): csb_rstn is driven here but no CSB clock port is connected in this block —
    // confirm whether the black box exposes one that must also be tied to `clock`.
    u_nvdla.io.core_clk := clock
    u_nvdla.io.rstn := ~reset.asBool
    u_nvdla.io.csb_rstn := ~reset.asBool

    // DBB AXI4 master: write address channel
    val (dbb, _) = dbb_axi_node.out(0)
    dbb.aw.valid := u_nvdla.io.nvdla_core2dbb_aw_awvalid
    u_nvdla.io.nvdla_core2dbb_aw_awready := dbb.aw.ready
    dbb.aw.bits.id := u_nvdla.io.nvdla_core2dbb_aw_awid
    dbb.aw.bits.len := u_nvdla.io.nvdla_core2dbb_aw_awlen
    dbb.aw.bits.size := u_nvdla.io.nvdla_core2dbb_aw_awsize
    dbb.aw.bits.addr := u_nvdla.io.nvdla_core2dbb_aw_awaddr

    // DBB write data channel
    dbb.w.valid := u_nvdla.io.nvdla_core2dbb_w_wvalid
    u_nvdla.io.nvdla_core2dbb_w_wready := dbb.w.ready
    dbb.w.bits.data := u_nvdla.io.nvdla_core2dbb_w_wdata
    dbb.w.bits.strb := u_nvdla.io.nvdla_core2dbb_w_wstrb
    dbb.w.bits.last := u_nvdla.io.nvdla_core2dbb_w_wlast

    // DBB read address channel
    dbb.ar.valid := u_nvdla.io.nvdla_core2dbb_ar_arvalid
    u_nvdla.io.nvdla_core2dbb_ar_arready := dbb.ar.ready
    dbb.ar.bits.id := u_nvdla.io.nvdla_core2dbb_ar_arid
    dbb.ar.bits.len := u_nvdla.io.nvdla_core2dbb_ar_arlen
    dbb.ar.bits.size := u_nvdla.io.nvdla_core2dbb_ar_arsize
    dbb.ar.bits.addr := u_nvdla.io.nvdla_core2dbb_ar_araddr

    // DBB write response channel
    u_nvdla.io.nvdla_core2dbb_b_bvalid := dbb.b.valid
    dbb.b.ready := u_nvdla.io.nvdla_core2dbb_b_bready
    u_nvdla.io.nvdla_core2dbb_b_bid := dbb.b.bits.id

    // DBB read data channel
    u_nvdla.io.nvdla_core2dbb_r_rvalid := dbb.r.valid
    dbb.r.ready := u_nvdla.io.nvdla_core2dbb_r_rready
    u_nvdla.io.nvdla_core2dbb_r_rid := dbb.r.bits.id
    u_nvdla.io.nvdla_core2dbb_r_rlast := dbb.r.bits.last
    u_nvdla.io.nvdla_core2dbb_r_rdata := dbb.r.bits.data

    // Optional CVSRAM AXI4 master; wired channel by channel exactly like the DBB port above.
    u_nvdla.io.nvdla_core2cvsram.foreach { u_nvdla_cvsram =>
      val (cvsram, _) = cvsram_axi_node.get.out(0)

      cvsram.aw.valid := u_nvdla_cvsram.aw_awvalid
      u_nvdla_cvsram.aw_awready := cvsram.aw.ready
      cvsram.aw.bits.id := u_nvdla_cvsram.aw_awid
      cvsram.aw.bits.len := u_nvdla_cvsram.aw_awlen
      cvsram.aw.bits.size := u_nvdla_cvsram.aw_awsize
      cvsram.aw.bits.addr := u_nvdla_cvsram.aw_awaddr

      cvsram.w.valid := u_nvdla_cvsram.w_wvalid
      u_nvdla_cvsram.w_wready := cvsram.w.ready
      cvsram.w.bits.data := u_nvdla_cvsram.w_wdata
      cvsram.w.bits.strb := u_nvdla_cvsram.w_wstrb
      cvsram.w.bits.last := u_nvdla_cvsram.w_wlast

      cvsram.ar.valid := u_nvdla_cvsram.ar_arvalid
      u_nvdla_cvsram.ar_arready := cvsram.ar.ready
      cvsram.ar.bits.id := u_nvdla_cvsram.ar_arid
      cvsram.ar.bits.len := u_nvdla_cvsram.ar_arlen
      cvsram.ar.bits.size := u_nvdla_cvsram.ar_arsize
      cvsram.ar.bits.addr := u_nvdla_cvsram.ar_araddr

      u_nvdla_cvsram.b_bvalid := cvsram.b.valid
      cvsram.b.ready := u_nvdla_cvsram.b_bready
      u_nvdla_cvsram.b_bid := cvsram.b.bits.id

      u_nvdla_cvsram.r_rvalid := cvsram.r.valid
      cvsram.r.ready := u_nvdla_cvsram.r_rready
      u_nvdla_cvsram.r_rid := cvsram.r.bits.id
      u_nvdla_cvsram.r_rlast := cvsram.r.bits.last
      u_nvdla_cvsram.r_rdata := cvsram.r.bits.data
    }

    // Configuration port (APB slave). The black box never reports an error, so pslverr is
    // tied low.
    val (cfg, _) = cfg_apb_node.in(0)
    u_nvdla.io.psel := cfg.psel
    u_nvdla.io.penable := cfg.penable
    u_nvdla.io.pwrite := cfg.pwrite
    u_nvdla.io.paddr := cfg.paddr
    u_nvdla.io.pwdata := cfg.pwdata
    cfg.prdata := u_nvdla.io.prdata
    cfg.pready := u_nvdla.io.pready
    cfg.pslverr := false.B

    // Interrupt
    val (io_int, _) = int_node.out(0)
    io_int(0) := u_nvdla.io.dla_intr
  }
}
//class TLNVDLA(params: NVDLAParams)(implicit p: Parameters)
// extends NVDLA(params) with HasTLControlRegMap
/** Describes where and how an [[NVDLA]] is attached to a subsystem.
  *
  * The NVDLA is elaborated inside its own `ClockSinkDomain`. `controlXType` selects both the
  * clock that drives that domain and the crossing inserted on the configuration port;
  * `frontXType` selects the crossing inserted on the DBB (memory) port; `intXType` selects the
  * interrupt-bus entry point.
  *
  * NOTE(review): the domain clock is always taken from the control side. If `frontXType` is
  * synchronous or rational, confirm that the front bus really has that relationship to the
  * clock chosen by `controlXType`.
  *
  * @param device       NVDLA configuration
  * @param controlWhere bus that hosts the configuration (cfg) port
  * @param frontWhere   bus that receives the DBB master traffic
  * @param controlXType clock relationship between `controlWhere` and the NVDLA domain
  * @param frontXType   clock relationship between the NVDLA domain and `frontWhere`
  * @param intXType     clock relationship used when entering the interrupt bus
  */
case class NVDLAAttachParams(
  device: NVDLAParams,
  controlWhere: TLBusWrapperLocation = PBUS,
  frontWhere: TLBusWrapperLocation = FBUS,
  controlXType: ClockCrossingType = AsynchronousCrossing(),
  frontXType: ClockCrossingType = AsynchronousCrossing(),
  intXType: ClockCrossingType = AsynchronousCrossing()
) extends DeviceParams {

  /** Instantiates the NVDLA inside `where` and connects its clock, TileLink ports and interrupt.
    *
    * Only synchronous, rational and asynchronous crossing types are matched for the clock and
    * interrupt selection; any other `ClockCrossingType` falls through those matches.
    *
    * @param where subsystem (or other attachable scope) to attach to
    * @return the newly created NVDLA wrapper
    */
  def attachTo(where: Attachable)(implicit p: Parameters): NVDLA = where {
    val name = s"nvdla_${device.config}"
    val cbus = where.locateTLBusWrapper(controlWhere)
    val fbus = where.locateTLBusWrapper(frontWhere)
    val nvdlaClockDomainWrapper = LazyModule(new ClockSinkDomain(take = None))
    val nvdla = nvdlaClockDomainWrapper { LazyModule(new NVDLA(device)) }
    nvdla.suggestName(name)

    // Bind the domain clock exactly once. This ran eagerly in the attaching scope before (it
    // was a statement in the block passed to coupleTo, not part of the callback); hoisting it
    // makes that explicit without changing where the ClockGroup is created.
    nvdlaClockDomainWrapper.clockNode := (controlXType match {
      case _: SynchronousCrossing =>
        cbus.dtsClk.foreach(_.bind(nvdla.dtsdevice))
        cbus.fixedClockNode
      case _: RationalCrossing =>
        cbus.clockNode
      case _: AsynchronousCrossing =>
        val nvdlaClockGroup = ClockGroup()
        nvdlaClockGroup := where.asyncClockGroupsNode
        nvdlaClockGroup
    })

    // Crossing helpers: the NVDLA-side half of each crossing is placed inside the NVDLA clock
    // domain, the other half in whatever scope the helper is applied from (the bus, below).
    // Without them the TileLink ports would be wired straight across the clock boundary.
    val nvdlaCfgXing = nvdlaClockDomainWrapper.crossIn(nvdla.cfg_tl_node)
    val nvdlaDbbXing = nvdlaClockDomainWrapper.crossOut(nvdla.dbb_tl_node)

    cbus.coupleTo(s"device_named_$name") { bus =>
      (nvdlaCfgXing(controlXType)
        := TLFragmenter(4, cbus.blockBytes)
        := TLWidthWidget(cbus.beatBytes)
        := bus)
    }

    // The domain clock is already bound above, so the front side only needs its crossing.
    fbus.coupleFrom(s"device_named_$name") { bus =>
      bus := TLFIFOFixer(TLFIFOFixer.all) := nvdlaDbbXing(frontXType)
    }

    (intXType match {
      case _: SynchronousCrossing => where.ibus.fromSync
      case _: RationalCrossing => where.ibus.fromRational
      case _: AsynchronousCrossing => where.ibus.fromAsync
    }) := nvdla.int_node

    nvdla
  }
}
The Periphery.scala file is as follows:
import chisel3._
import freechips.rocketchip.config.Field
import freechips.rocketchip.subsystem.BaseSubsystem
import freechips.rocketchip.diplomacy.{BufferParams, LazyModule, LazyModuleImp}
import freechips.rocketchip.tilelink.{TLBuffer, TLIdentityNode}
/** Config key holding the optional NVDLA parameters; defaults to `None` (no NVDLA). */
case object NVDLAKey extends Field[Option[NVDLAParams]](None)
/** Integer config key, default 0.
  *
  * NOTE(review): this key is not read anywhere in the visible code; presumably it was meant to
  * add extra buffers on the front-bus connection — confirm and either wire it up or remove it.
  */
case object NVDLAFrontBusExtraBuffers extends Field[Int](0)
/** Subsystem mix-in that attaches an NVDLA when [[NVDLAKey]] is set.
  *
  * `nvdla` is `None` when the key is unset; otherwise it holds the wrapper returned by
  * `NVDLAAttachParams(...).attachTo(this)` using the default attachment settings.
  */
trait CanHavePeripheryNVDLA { this: BaseSubsystem =>
  val nvdla: Option[NVDLA] =
    p(NVDLAKey).map(nvdlaParams => NVDLAAttachParams(nvdlaParams).attachTo(this))
}
This appears to work, and Verilog is generated without errors, but I don't know how to set the frequency of this clock group. How can I define the frequency?