transit-lang-cmp
transit-lang-cmp copied to clipboard
Branch for Nim? Here's a start for you (or someone)
For just the loading/parsing part this is 4.7x faster than Rust-1.64 on one of my machines (single threaded). Dunno about all the webserver/json stuff, as that is not really my interest (but I do realize it is the main point of the comparison).
import std/[times, stats, tables], cligen/[mfile, mslice]
type
  ## Index: maps an id column (zero-copy `MSlice` into the mmap'd file) to the
  ## list of row numbers (1-origin, header counted) where that id occurred.
  Tab = Table[MSlice, seq[int]]
  StopTm = tuple[tripId, stopId, arrival, departure: MSlice]  # one stop_times.txt row
  Trip = tuple[tripId, routeId, serviceId: MSlice]            # one trips.txt row
  SchedReply = tuple[stopId, arrival, departure: MSlice] # schedules
  TripReply = tuple[tripId, serviceId, routeId: MSlice; scheds: seq[SchedReply]]
template mkGet(name, R, path, nMax, nGuess, tag, record, header) =
  ## Generates `proc name(): (seq[R], Tab)` parsing "../MBTA_GTFS/" & `path`.
  ## `record` is an expression over the injected `col`/`id` building one `R`;
  ## `header` is the expected first line prefix; `nMax` caps split columns.
  ## Returns the parsed rows plus an id -> row-number index; raises `IOError`
  ## when the file header does not match `header`.
  proc name(): (seq[R], Tab) =
    var col {.inject.}: seq[MSlice] # {.inject.} so the `record` expr can see it
    let sep = initSep(",")
    let t0 = epochTime()
    var i = 0                       # row counter; row 0 is the header
    for row in mSlices("../MBTA_GTFS/" & path, eat='\0', keep=true):
      if i == 0:
        if not row.startsWith(header):
          raise newException(IOError, path & " not in expected format")
        # else: result[0]=newSeqOfCap[R](nGuess) #fileSize/avgLnLen, but unhelpful
      else:
        sep.split row, col, nMax # discard row.msplit(col, ',', nMax) slower
        let id {.inject.} = col[0]
        result[0].add record
        result[1].mgetOrPut(id, @[]).add i
      inc i
    echo "parsed ",result[0].len," ",tag," in ",epochTime() - t0
# Instantiate the two loaders.  stop_times.txt columns are
# trip_id,arrival_time,departure_time,stop_id -> StopTm(tripId,stopId,arrival,departure),
# hence the (id, col[3], col[1], col[2]) reordering; likewise for trips.txt.
mkGet(getStopTms, StopTm, "stop_times.txt", 5, 1_000_000, "stop times",
      (id, col[3], col[1], col[2]), "trip_id,arrival_time,departure_time,stop_id,")
mkGet(getTrips, Trip, "trips.txt", 4, 70_000, "trips",
      (col[2], id, col[1]), "route_id,service_id,trip_id,")
iterator resps(route: MSlice, stopTms: seq[StopTm], trips: seq[Trip];
               stopTmsIxByTrip, tripIxsByRoute: Tab): TripReply =
  ## Yields one `TripReply` per trip serving `route`, each carrying that
  ## trip's schedule entries (possibly empty).  Unknown routes yield nothing.
  ## Uses `getOrDefault` (empty seq on miss) instead of exception-driven
  ## `try/except KeyError` lookups.  Could just build a JSON string as it goes.
  for i in tripIxsByRoute.getOrDefault(route):
    var res: TripReply = (trips[i].tripId, trips[i].serviceId, trips[i].routeId,
                          newSeq[SchedReply]())
    for j in stopTmsIxByTrip.getOrDefault(trips[i].tripId):
      let st = stopTms[j]
      res.scheds.add (st.stopId, st.arrival, st.departure)
    yield res
# Driver: load both tables, then answer one query per stdin line, timing each.
let (stopTms, stopTmsIxByTrip) = getStopTms() # Rust release build ~4.7X slower
let (trips, tripIxsByRoute) = getTrips()
var dts: RunningStat                          # per-query latency statistics
for route in mSlices("/dev/stdin"): #XXX Really buncha webserver-json junk
  let t0 = epochTime()
  for r in resps(route, stopTms, trips, stopTmsIxByTrip, tripIxsByRoute):
    # " rId: " fixed to match the "trId: "/" svId: " label style
    echo "trId: ",r.tripId," svId: ",r.serviceId," rId: ",r.routeId," nSch: ",
      r.scheds.len #XXX Maybe more than just .len here
  dts.push epochTime() - t0
echo "latencies: ", dts
# if cligen is not installed then cg=--path:$HOME/pkg/cb/cg or similar
# nim c -d:lto -d:danger -d:useMalloc --passC:-march=native --panics:on $cg app
# awk -F, '{print $1}' <../MBTA_GTFS/trips.txt|tail -n+2|sort -u>inp; ./app <inp
# gives latencies: RunningStat(number of probes: 189
# max: 0.002188444137573242
# min: 2.86102294921875e-06
# sum: 0.06806111335754395
# mean: 0.0003601117108864762
# std deviation: 0.0004244546403409464)
If you worry about fancy self-professedly-not-a-standard RFC4180 quoted-escaped-CSV inputs then the best answer is a conversion program (either run just once like unzip
or run on demand in a pipeline) along the lines of c2* in https://github.com/c-blake/nio/tree/main/utils. For me that runs in less wall time than split-parsing and so is, in effect, no extra wall time if >=2 CPUs are available. { Of course if parsing time itself is a problem then you could do it in parallel, but the std/tables you might most naturally choose to store into are not really parallel-safe. }