transit-lang-cmp icon indicating copy to clipboard operation
transit-lang-cmp copied to clipboard

Branch for Nim? Here's a start for you (or someone)

Open c-blake opened this issue 1 year ago • 0 comments

For just the loading/parsing part, this is 4.7x faster than Rust 1.64 on one of my machines (single-threaded). I don't know about the webserver/JSON parts, as those are not really my interest (though I do realize they are the main point of the comparison).

import std/[times, stats, tables], cligen/[mfile, mslice]
type
  # Lookup index: key slice -> integer indices recorded for that key.
  Tab = Table[MSlice, seq[int]]

  StopTm = tuple        # one record built from a stop_times.txt row
    tripId: MSlice
    stopId: MSlice
    arrival: MSlice
    departure: MSlice

  Trip = tuple          # one record built from a trips.txt row
    tripId: MSlice
    routeId: MSlice
    serviceId: MSlice

  SchedReply = tuple    # schedules: per-stop entry nested inside a TripReply
    stopId: MSlice
    arrival: MSlice
    departure: MSlice

  TripReply = tuple     # one trip together with all of its schedule entries
    tripId: MSlice
    serviceId: MSlice
    routeId: MSlice
    scheds: seq[SchedReply]

template mkGet(name, R, path, nMax, nGuess, tag, record, header) =
  ## Defines `proc name(): (seq[R], Tab)` which loads "../MBTA_GTFS/" & `path`.
  ##
  ## * `R`      - record type stored in the returned seq.
  ## * `nMax`   - passed through to `sep.split` as its third argument
  ##              (presumably the maximum column count - confirm in cligen).
  ## * `nGuess` - currently unused; kept for the disabled pre-sizing line below.
  ## * `tag`    - label used in the timing message printed after loading.
  ## * `record` - expression producing an `R`; it may refer to the injected
  ##              names `col` (split columns) and `id` (`col[0]`).
  ## * `header` - prefix the first line of the file must start with; otherwise
  ##              an `IOError` is raised.
  ##
  ## The returned `Tab` maps each `id` to the positions, within the returned
  ## seq, of every record carrying that id.
  proc name(): (seq[R], Tab) =
    var col {.inject.}: seq[MSlice]
    let sep = initSep(",")
    let t0 = epochTime()
    var needHeader = true               # true until the first line is consumed
    # NOTE(review): keep=true presumably leaves the file mapped so the MSlices
    # stored in the result stay valid after the loop - confirm in cligen/mfile.
    for row in mSlices("../MBTA_GTFS/" & path, eat='\0', keep=true):
      if needHeader:
        if not row.startsWith(header):
          raise newException(IOError, path & " not in expected format")
        needHeader = false
#       result[0]=newSeqOfCap[R](nGuess) #fileSize/avgLnLen, but unhelpful
      else:
        sep.split row, col, nMax    # discard row.msplit(col, ',', nMax) slower
        let id {.inject.} = col[0]
        result[0].add record
        # Index by the record's actual position in result[0].  Using the file
        # line number instead (which also counts the header) would point every
        # entry one record too far and the last one past the end of the seq.
        result[1].mgetOrPut(id, @[]).add result[0].high
    echo "parsed ",result[0].len," ",tag," in ",epochTime() - t0

# getStopTms(): loads "stop_times.txt".  The first line must start with the
# header prefix given below.  Each StopTm is built as (tripId, stopId, arrival,
# departure) from columns 0, 3, 1, 2, and the index table is keyed by column 0
# (named trip_id in the header).
mkGet(getStopTms, StopTm, "stop_times.txt", 5, 1_000_000, "stop times",
  (id, col[3], col[1], col[2]), "trip_id,arrival_time,departure_time,stop_id,")

# getTrips(): loads "trips.txt".  Each Trip is built as (tripId, routeId,
# serviceId) from columns 2, 0, 1, and the index table is keyed by column 0
# (named route_id in the header), i.e. it groups trips by route.
mkGet(getTrips, Trip, "trips.txt", 4, 70_000, "trips",
  (col[2], id, col[1]), "route_id,service_id,trip_id,")

iterator resps(route: MSlice, stopTms: seq[StopTm], trips: seq[Trip];
               stopTmsIxByTrip, tripIxsByRoute: Tab): TripReply =
  ## Yields one `TripReply` per trip indexed under `route` in `tripIxsByRoute`.
  ##
  ## Each reply carries the trip's ids plus a `scheds` entry for every stop
  ## time indexed under that trip's id in `stopTmsIxByTrip`.  A route missing
  ## from `tripIxsByRoute` yields nothing; a trip missing from
  ## `stopTmsIxByTrip` is yielded with empty `scheds`.
  ##
  ## Membership is tested with `in` rather than by catching `KeyError`: this
  ## is an inline iterator, so a `try` wrapped around `yield` would also
  ## swallow any `KeyError` raised by the caller's loop body.
  var res: TripReply # This could just build JSON string as it goes
  if route in tripIxsByRoute:
    for i in tripIxsByRoute[route]:
      res = (trips[i].tripId, trips[i].serviceId, trips[i].routeId, @[])
      if trips[i].tripId in stopTmsIxByTrip:
        for j in stopTmsIxByTrip[trips[i].tripId]:
          let stJ = stopTms[j]
          res.scheds.add (stJ.stopId, stJ.arrival, stJ.departure)
      yield res

# Driver: load both files once, then answer one query per line of stdin and
# report per-query latency statistics at the end.
let (stopTms, stopTmsIxByTrip) = getStopTms() # Rust release build ~4.7X slower
let (trips, tripIxsByRoute) = getTrips()
var dts: RunningStat                  # accumulates one latency sample per route
for route in mSlices("/dev/stdin"): #XXX Really buncha webserver-json junk
  let t0 = epochTime()
  for r in resps(route, stopTms, trips, stopTmsIxByTrip, tripIxsByRoute):
    # " rId: " carries the same ": " separator as the other labels so the
    # route id does not run straight into its label.
    echo "trId: ",r.tripId," svId: ",r.serviceId," rId: ",r.routeId," nSch: ",
         r.scheds.len               #XXX Maybe more than just .len here
  dts.push epochTime() - t0
echo "latencies: ", dts

# if cligen is not installed then cg=--path:$HOME/pkg/cb/cg or similar
# nim c -d:lto -d:danger -d:useMalloc --passC:-march=native --panics:on $cg app
# awk -F, '{print $1}' <../MBTA_GTFS/trips.txt|tail -n+2|sort -u>inp; ./app <inp
# gives latencies: RunningStat(number of probes: 189
#                              max: 0.002188444137573242
#                              min: 2.86102294921875e-06
#                              sum: 0.06806111335754395
#                              mean: 0.0003601117108864762
#                              std deviation: 0.0004244546403409464)

If you worry about fancy self-professedly-not-a-standard RFC4180 quoted-escaped-CSV inputs then the best answer is a conversion program (either run just once like unzip or run on demand in a pipeline) along the lines of c2* in https://github.com/c-blake/nio/tree/main/utils. For me that runs in less wall time than split-parsing and so is, in effect, no extra wall time if >=2 CPUs are available. { Of course if parsing time itself is a problem then you could do it in parallel, but the std/tables you might most naturally choose to store into are not really parallel-safe. }

c-blake avatar Oct 26 '22 12:10 c-blake