Content transfer takes longer on cache hit
Expected Behavior
Content transfer time should be similar between cache hits and misses
Current Behavior
After first byte, it takes longer to transfer cached content than non-cached content
Possible Solution
No response
Steps to Reproduce (for bugs)
- Clear varnish cache
- Make a number of requests to warm the cache, observe the content download timing
- Make the same requests again with a warm cache, content download time increases significantly
Context
I'm attempting to use varnish in front of imgproxy to cache resized images and avoid costly resizing when possible, but the time to download content takes much longer when there is a cache hit than miss, diminishing the time savings of using varnish
Running a NodeJS test with a cleared cache gives these results
Whereas re-running the test with the cache warmed gives
as average timings for each of 32 requests sent. While varnish is overall faster, it seems unexpected that content transfer time would 2x on cache hit.
Varnish is running in a k8s cluster next to imgproxy and the underlying data store. There is an nginx ingress fronting varnish, though that stays stable between tests. Timing was determined using code from https://github.com/RisingStack/example-http-timings/blob/master/app.js
Full test code (with sensitive data removed (please ignore messy js)) is
const images = [<LIST_OF_IMAGE_URLS_TO_PASS_TO_IMGPROXY>];
'use strict'
const assert = require('assert')
const url = require('url')
const http = require('http')
const https = require('https')
const TIMEOUT_IN_MILLISECONDS = 30 * 1000
const NS_PER_SEC = 1e9
const MS_PER_NS = 1e6
/**
* Creates a request and collects HTTP timings
* @function request
* @param {Object} options
* @param {String} [options.method='GET']
* @param {String} options.protocol
* @param {String} options.hostname
* @param {Number} [options.port]
* @param {String} [options.path]
* @param {Object} [options.headers={}]
* @param {String} [options.body]
* @param {Function} callback
*/
function request ({
method = 'GET',
protocol,
hostname,
port,
path,
headers = {},
body
} = {}, callback) {
// Validation
assert(protocol, 'options.protocol is required')
assert(['http:', 'https:'].includes(protocol), 'options.protocol must be one of: "http:", "https:"')
assert(hostname, 'options.hostname is required')
assert(callback, 'callback is required')
// Initialization
const eventTimes = {
// use process.hrtime() as it's not a subject of clock drift
startAt: process.hrtime(),
dnsLookupAt: undefined,
tcpConnectionAt: undefined,
tlsHandshakeAt: undefined,
firstByteAt: undefined,
endAt: undefined
}
// Making request
const req = (protocol.startsWith('https') ? https : http).request({
protocol,
method,
hostname,
port,
path,
headers
}, (res) => {
let responseBody = ''
req.setTimeout(TIMEOUT_IN_MILLISECONDS)
// Response events
res.once('readable', () => {
eventTimes.firstByteAt = process.hrtime()
})
res.on('data', (chunk) => { responseBody += chunk })
// End event is not emitted when stream is not consumed fully
// in our case we consume it see: res.on('data')
res.on('end', () => {
eventTimes.endAt = process.hrtime()
callback(null, {
headers: res.headers,
timings: getTimings(eventTimes),
body: responseBody
})
})
})
// Request events
req.on('socket', (socket) => {
socket.on('lookup', () => {
eventTimes.dnsLookupAt = process.hrtime()
})
socket.on('connect', () => {
eventTimes.tcpConnectionAt = process.hrtime()
})
socket.on('secureConnect', () => {
eventTimes.tlsHandshakeAt = process.hrtime()
})
socket.on('timeout', () => {
req.abort()
const err = new Error('ETIMEDOUT')
err.code = 'ETIMEDOUT'
callback(err)
})
})
req.on('error', callback)
// Sending body
if (body) {
req.write(body)
}
req.end()
}
/**
* Calculates HTTP timings
* @function getTimings
* @param {Object} eventTimes
* @param {Number} eventTimes.startAt
* @param {Number|undefined} eventTimes.dnsLookupAt
* @param {Number} eventTimes.tcpConnectionAt
* @param {Number|undefined} eventTimes.tlsHandshakeAt
* @param {Number} eventTimes.firstByteAt
* @param {Number} eventTimes.endAt
* @return {Object} timings - { dnsLookup, tcpConnection, tlsHandshake, firstByte, contentTransfer, total }
*/
function getTimings (eventTimes) {
return {
// There is no DNS lookup with IP address
dnsLookup: eventTimes.dnsLookupAt !== undefined ?
getHrTimeDurationInMs(eventTimes.startAt, eventTimes.dnsLookupAt) : undefined,
tcpConnection: getHrTimeDurationInMs(eventTimes.dnsLookupAt || eventTimes.startAt, eventTimes.tcpConnectionAt),
// There is no TLS handshake without https
tlsHandshake: eventTimes.tlsHandshakeAt !== undefined ?
(getHrTimeDurationInMs(eventTimes.tcpConnectionAt, eventTimes.tlsHandshakeAt)) : undefined,
firstByte: getHrTimeDurationInMs((eventTimes.tlsHandshakeAt || eventTimes.tcpConnectionAt), eventTimes.firstByteAt),
contentTransfer: getHrTimeDurationInMs(eventTimes.firstByteAt, eventTimes.endAt),
total: getHrTimeDurationInMs(eventTimes.startAt, eventTimes.endAt)
}
}
/**
* Get duration in milliseconds from process.hrtime()
* @function getHrTimeDurationInMs
* @param {Array} startTime - [seconds, nanoseconds]
* @param {Array} endTime - [seconds, nanoseconds]
* @return {Number} durationInMs
*/
function getHrTimeDurationInMs (startTime, endTime) {
const secondDiff = endTime[0] - startTime[0]
const nanoSecondDiff = endTime[1] - startTime[1]
const diffInNanoSecond = secondDiff * NS_PER_SEC + nanoSecondDiff
return diffInNanoSecond / MS_PER_NS
}
const runTest = async (headers) => {
const timingAverages = {
dnsLookup: 0,
tcpConnection: 0,
tlsHandshake: 0,
firstByte: 0,
contentTransfer: 0,
total: 0
}
let totalRequests = 0;
const cb = (err, res) => {
if (err) {
console.log(err);
return;
}
totalRequests += 1;
if (timingAverages.total === 0) {
Object.assign(timingAverages, res.timings);
return;
}
const timings = res.timings;
timingAverages.dnsLookup = (timingAverages.dnsLookup + timings.dnsLookup) / 2
timingAverages.tcpConnection = (timingAverages.tcpConnection + timings.tcpConnection) / 2
timingAverages.tlsHandshake = (timingAverages.tlsHandshake + timings.tlsHandshake) / 2
timingAverages.firstByte = (timingAverages.firstByte + timings.firstByte) / 2
timingAverages.contentTransfer = (timingAverages.contentTransfer + timings.contentTransfer) / 2
timingAverages.total = (timingAverages.total + timings.total) / 2
if (totalRequests === images.length) {
console.log(timingAverages)
}
}
for (const image of images) {
request(Object.assign(url.parse(`<VARNISH_URL>/rs:fit:1024:1024:1/plain/${image}`), {
headers
}), cb);
}
await new Promise((resolve) => setInterval(() => {if (totalRequests === images.length) {resolve()}}, 100))
}
const testNoCache = async () => {
return runTest({
'Cache-Control': 'max-age=0'
})
};
const testCache = async () => {
return runTest({
'Cache-Control': 'public, max-age=65535'
})
}
(async () => {
await testNoCache();
await testCache();
})()
and default.vcl is
vcl 4.1;
import directors;
import std;
backend imgproxy {
.host = "<IMAGE_PROXY_K8S_SERVICE>";
.port = "8080";
}
sub vcl_recv {
if (req.http.host == "<VARNISH_HOST>") {
set req.backend_hint = imgproxy;
// Ignore imgproxy url signing for now
set req.url = "/insecure" + req.url;
set req.url = regsub(req.url, "<OBJECT_STORE_URL>", "<OBJECT_STORE_K8S_SERVICE>");
// Downgrade https requests to http as we are behind nginx ingress already
set req.url = regsub(req.url, "https://<OBJECT_STORE_K8S_SERVICE>", "http://<OBJECT_STORE_K8S_SERVICE>");
unset req.http.Cookie;
}
}
Varnish Cache version
varnishd (varnish-7.4.2 revision cd1d10ab53a6f6115b2b4f3b2a1da94c1f749f80)
Operating system
Rancher K8s on Ubuntu 20.04
Source of binary packages used (if any)
Docker hub
Could you please follow these steps?
- start varnishd
- perform your tests
- run
varnishlog -w tests.vsl -d -g raw - upload
tests.vslsomewhere
Here's the log from the test run with lorempicsum photos, just changed host name of internal hosts to <VARNISH_HOST> (and changed .vsl to .log to allow gh uploading). tests.log
Unfortunately the file is unreadable, could you please upload a new one using the following command?
varnishlog -w tests.log -d -g raw -C -X '*Header:^host:'
This command should capture the logs omitting host headers, please let me know if that is not the case. It looks like you redacted the host names directly on the binary file, which broke the file format.
Alternatively, if you still have the original unmodified test.vsl you can do this without needing to perform a new test:
varnishlog -w tests.vsl -w tests.log -g raw -C -X '*Header:^host:'
This one should read your log dump and made a copy with host headers filtered out.
Putting aside the question of whether or not NodeJS is a good choice to properly calculate benchmarks at all, I think there is a methodological issue here:
timingAverages.contentTransfer = (timingAverages.contentTransfer + timings.contentTransfer) / 2
You seem to be calculating a moving average here, where the last value attributes for 50% of the value, the previous one 25% and so on. Thus, the values reported have very limited statistical significance. You should at least calculate an average (sum up a total and a count of measurements, then divide one by the other) or report percentiles. Note: I am a statistics noob and others would have far more elaborate answers.
Also, you can not just arbitrarily edit VSL files without rendering them unusable to our tooling. If you absolutely need to, you can, in principle, replace a specific string with one by exactly the same length, but changing the length of substrings will break the VSL format.
Heres the corrected tests.log, and I appreciate the help, this is my first time using varnish and I'm still not used to the intricacies tests.log
As for the concern around utilizing NodeJS, this was spun out of real-world observations, and I do believe the timing isn't that far off to cause 2x readings for one test vs another. The same goes for utilizing a rolling average, but to assuage concerns I rewrote part of the test like
const runTest = async (headers) => {
const timingAverages = {
dnsLookup: [],
tcpConnection: [],
tlsHandshake: [],
firstByte: [],
contentTransfer: [],
total: []
}
let totalRequests = 0;
const cb = (err, res) => {
if (err) {
console.log(err);
return;
}
totalRequests += 1;
const timings = res.timings;
timingAverages.dnsLookup.push(timings.dnsLookup)
timingAverages.tcpConnection.push(timings.tcpConnection)
timingAverages.tlsHandshake.push(timings.tlsHandshake)
timingAverages.firstByte.push(timings.firstByte)
timingAverages.contentTransfer.push(timings.contentTransfer)
timingAverages.total.push(timings.total)
if (totalRequests === images.length) {
console.log(timingAverages)
timingAverages.dnsLookup = timingAverages.dnsLookup.reduce((pv, cv) => pv + cv, 0) / timingAverages.dnsLookup.length;
timingAverages.tcpConnection = timingAverages.tcpConnection.reduce((pv, cv) => pv + cv, 0) / timingAverages.tcpConnection.length;
timingAverages.tlsHandshake = timingAverages.tlsHandshake.reduce((pv, cv) => pv + cv, 0) / timingAverages.tlsHandshake.length;
timingAverages.firstByte = timingAverages.firstByte.reduce((pv, cv) => pv + cv, 0) / timingAverages.firstByte.length;
timingAverages.contentTransfer = timingAverages.contentTransfer.reduce((pv, cv) => pv + cv, 0) / timingAverages.contentTransfer.length;
timingAverages.total = timingAverages.total.reduce((pv, cv) => pv + cv, 0) / timingAverages.total.length;
console.log(timingAverages)
}
}
which gives each individual readings as well as an average, and the results look like this for a cold cache
{
dnsLookup: [
19.93784, 28.07454, 33.73065,
49.773769, 49.608095, 185.92749,
185.845296, 63.645777, 252.009178,
33.445176, 233.147929, 63.427909,
92.640037, 154.376359, 92.418847,
79.524635, 78.890125, 106.703612,
107.129597, 123.577273, 122.843716,
136.439182, 136.458035, 154.434705,
206.588555, 170.214109, 170.566701,
206.964097, 215.875967, 233.467083,
251.930575, 216.031609
],
tcpConnection: [
61.789487, 61.164254, 59.555189,
56.627942, 59.769154, 67.903728,
68.032112, 60.885966, 63.671877,
62.399991, 61.407015, 60.381059,
61.291969, 64.316233, 61.486067,
59.003847, 58.11804, 60.334,
60.637711, 57.456184, 63.586648,
60.418539, 59.777141, 60.344555,
63.165154, 62.619101, 65.957661,
64.468462, 61.509323, 64.626933,
63.244574, 61.32352
],
tlsHandshake: [
131.528177, 131.027368, 134.09194,
134.929551, 132.524353, 132.049482,
132.717415, 136.469747, 134.731764,
137.039744, 122.696647, 129.606645,
137.040508, 132.82508, 137.61966,
138.155856, 135.369672, 126.030985,
139.387236, 127.508853, 130.79425,
134.078286, 135.324556, 130.701035,
126.491662, 137.30185, 135.699126,
129.954115, 129.106353, 133.030783,
132.562853, 132.925048
],
firstByte: [
429.879613, 489.604641, 1379.527324,
2249.508739, 2338.470817, 2287.412858,
2508.532585, 2641.129194, 2548.869027,
3295.907005, 3352.883484, 3712.970083,
3808.411803, 3873.279871, 4167.281563,
4371.000475, 4880.835777, 5078.747737,
5073.98584, 5072.798888, 5069.92534,
5096.029239, 5095.605028, 5082.044114,
4824.831587, 5098.138493, 5096.914209,
5067.448072, 5069.456898, 5065.09173,
5096.810019, 5054.294952
],
contentTransfer: [
185.874166, 139.249603, 136.787592,
150.626073, 140.612529, 133.030364,
136.654004, 190.566206, 124.821471,
140.693461, 195.565545, 202.951778,
204.659713, 211.476559, 190.607662,
200.512238, 216.481054, 0.156129,
0.196418, 0.153952, 0.237125,
0.192032, 0.131228, 0.150658,
244.335089, 0.208147, 0.165334,
0.164052, 0.226009, 0.064772,
0.19143, 219.050892
],
total: [
829.009283, 849.120406, 1743.692695,
2641.466074, 2720.984948, 2806.323922,
3031.781412, 3092.69689, 3124.103317,
3669.485377, 3965.70062, 4169.337474,
4304.04403, 4436.274102, 4649.413799,
4848.197051, 5369.694668, 5371.972463,
5381.336802, 5381.49515, 5387.387079,
5427.157278, 5427.295988, 5427.675067,
5465.412047, 5468.4817, 5469.303031,
5468.998798, 5476.17455, 5496.281301,
5544.739451, 5683.626021
]
}
{
dnsLookup: 132.989014625,
tcpConnection: 61.789794875,
tlsHandshake: 132.85376875,
firstByte: 3883.6758439062496,
contentTransfer: 105.21229015625,
total: 4316.520712312501
}
while it looks like this with a warm
{
dnsLookup: [
45.096431, 17.548342, 30.140989,
29.361672, 45.165336, 15.195519,
62.77775, 62.700762, 78.795172,
79.234249, 98.175327, 98.571459,
206.170511, 205.897793, 177.512,
298.529712, 259.528888, 276.470497,
230.468349, 156.335343, 139.183351,
193.025498, 177.924663, 123.621447,
230.695377, 139.244965, 192.937956,
298.396759, 259.371885, 123.989488,
276.809943, 156.672806
],
tcpConnection: [
64.089612, 67.189669, 60.757949,
62.34496, 57.808701, 62.477281,
59.360514, 59.497304, 64.657762,
64.853223, 58.971221, 59.22844,
72.07412, 71.686461, 62.375393,
72.20083, 71.139955, 71.595018,
74.530317, 69.307778, 64.791249,
64.613187, 74.218542, 67.067098,
74.91232, 61.302828, 64.867489,
71.859231, 70.756694, 66.591919,
72.609605, 69.3004
],
tlsHandshake: [
66.537938, 133.982176, 139.076164,
127.954057, 133.59868, 138.255908,
142.454908, 143.098861, 128.757382,
136.550904, 145.994982, 146.498379,
158.544484, 160.273312, 147.492856,
78.167797, 66.291941, 153.241899,
145.544286, 68.583907, 153.774298,
69.055217, 143.069149, 136.429262,
71.646476, 143.143541, 141.338993,
151.589382, 66.709555, 137.691997,
153.167338, 144.856469
],
firstByte: [
78.655131, 71.588653, 69.311978,
71.053851, 71.270913, 74.159171,
78.788136, 76.308675, 69.630887,
71.283787, 69.161124, 72.509248,
72.638633, 81.996677, 69.512148,
80.203901, 75.959751, 72.795485,
76.371615, 886.406136, 2971.690184,
3067.55421, 3419.933943, 3558.537114,
3673.620399, 3742.914859, 3898.308628,
3859.171898, 4049.649664, 4280.476296,
4206.517422, 4412.503068
],
contentTransfer: [
135.221546, 155.11207, 147.79899,
155.551035, 148.40266, 211.769714,
197.639815, 211.169207, 219.848684,
215.193694, 225.630533, 224.623418,
130.257422, 143.230278, 254.498134,
182.671126, 243.916342, 204.59074,
259.814462, 202.205542, 212.028139,
218.538249, 257.656505, 233.673702,
259.463878, 255.447961, 267.603949,
265.52986, 284.802727, 265.268678,
281.500461, 421.833118
],
total: [
389.600658, 445.42091, 447.08607,
446.265575, 456.24629, 501.857593,
541.021123, 552.774809, 561.689887,
567.115857, 597.933187, 601.430944,
639.68517, 663.084521, 711.390531,
711.773366, 716.836877, 778.693639,
786.729029, 1382.838706, 3541.467221,
3612.786361, 4072.802802, 4119.328623,
4310.33845, 4342.054154, 4565.057015,
4646.54713, 4731.290525, 4874.018378,
4990.604769, 5205.165861
]
}
{
dnsLookup: 149.54844496875,
tcpConnection: 66.5324084375,
tlsHandshake: 127.29289056249999,
firstByte: 1482.2026120312498,
contentTransfer: 221.64039496875,
total: 2047.21675096875
}
For the real-world experience, chrome network shows graphs like this for a cold cache (test is just all images in an html page)
with each blue block (content transfer time) being quite short
where the same with a warm cache looks like this
The numbers from the log do not reflect your reporting at all. For Cache misses, the total processing time (second last column) is in the multiple seconds range, while for cache hits it is at a millisecond or below, while "socket send" times (last column) are roughly comparable (we can not measure when data arrives at the client, we only know when we have handed it to the kernel). Do you maybe saturate some other resource such that, when varnish delivers all content simultaneously, the individual transfer times increase, but overall throughput as well? Notice how in the waterfall the transfers for the "miss" case are staggered, but all start at about the same time for the "hit" case.
$ /tmp/bin/varnishlog -r ~/Downloads/tests\(1\).log -q 'VCL_call ~ MISS' -I Timestamp:Resp| grep Time
- Timestamp Resp: 1706546727.029209 0.352031 0.001837
- Timestamp Resp: 1706546727.098763 0.422016 0.000992
- Timestamp Resp: 1706546728.001734 1.307962 0.000963
- Timestamp Resp: 1706546728.827826 2.120532 0.001338
- Timestamp Resp: 1706546728.979831 2.272708 0.000684
- Timestamp Resp: 1706546729.070863 2.212195 0.001068
- Timestamp Resp: 1706546729.301213 2.442978 0.001792
- Timestamp Resp: 1706546729.301630 2.579351 0.002100
- Timestamp Resp: 1706546729.409134 2.486631 0.001132
- Timestamp Resp: 1706546729.929259 3.235501 0.001075
- Timestamp Resp: 1706546730.181512 3.291490 0.001233
- Timestamp Resp: 1706546730.360417 3.642044 0.001832
- Timestamp Resp: 1706546730.502692 3.746410 0.002780
- Timestamp Resp: 1706546730.614502 3.787374 0.001839
- Timestamp Resp: 1706546730.861007 4.103380 0.001615
- Timestamp Resp: 1706546731.047794 4.302196 0.001754
- Timestamp Resp: 1706546731.527647 4.790143 0.001948
- Timestamp Resp: 1706546731.624235 4.759561 0.001482
- Timestamp Resp: 1706546731.764854 5.002799 0.000073
- Timestamp Resp: 1706546731.780470 5.002835 0.000200
- Timestamp Resp: 1706546731.780559 5.002260 0.000195
- Timestamp Resp: 1706546731.783999 5.002704 0.000153
- Timestamp Resp: 1706546731.829995 5.030261 0.000237
- Timestamp Resp: 1706546731.830311 5.030131 0.000197
- Timestamp Resp: 1706546731.830513 5.013749 0.000242
- Timestamp Resp: 1706546731.840052 4.954164 0.001912
- Timestamp Resp: 1706546731.843164 5.002504 0.000052
- Timestamp Resp: 1706546731.844442 5.002824 0.000061
- Timestamp Resp: 1706546731.872344 5.002712 0.000107
- Timestamp Resp: 1706546731.880732 5.002868 0.000131
- Timestamp Resp: 1706546731.905851 5.002376 0.000117
- Timestamp Resp: 1706546731.925591 5.003290 0.000145
- Timestamp Resp: 1706546738.242944 0.742073 0.002865
- Timestamp Resp: 1706546740.412813 2.851692 0.002231
- Timestamp Resp: 1706546740.527030 2.992088 0.003202
- Timestamp Resp: 1706546740.952131 3.355464 0.002205
- Timestamp Resp: 1706546741.019898 3.490349 0.002326
- Timestamp Resp: 1706546741.186267 3.603022 0.002875
- Timestamp Resp: 1706546741.218674 3.668178 0.003754
- Timestamp Resp: 1706546741.432978 3.827275 0.001501
- Timestamp Resp: 1706546741.518722 3.790050 0.002752
- Timestamp Resp: 1706546741.588033 3.981912 0.004020
- Timestamp Resp: 1706546741.787824 4.253407 0.047221
- Timestamp Resp: 1706546741.890083 4.179403 0.044593
- Timestamp Resp: 1706546741.921302 4.342907 0.003407
/tmp/bin/varnishlog -r ~/Downloads/tests\(1\).log -q 'VCL_call ~ HIT' -I Timestamp:Resp| grep Time
- Timestamp Resp: 1706546737.379089 0.000432 0.000174
- Timestamp Resp: 1706546737.414126 0.000527 0.000298
- Timestamp Resp: 1706546737.414613 0.000378 0.000177
- Timestamp Resp: 1706546737.420638 0.000276 0.000121
- Timestamp Resp: 1706546737.428999 0.000269 0.000117
- Timestamp Resp: 1706546737.438561 0.000443 0.000206
- Timestamp Resp: 1706546737.472861 0.000544 0.000342
- Timestamp Resp: 1706546737.472894 0.001119 0.000910
- Timestamp Resp: 1706546737.474015 0.001805 0.001439
- Timestamp Resp: 1706546737.485190 0.000759 0.000595
- Timestamp Resp: 1706546737.503804 0.000931 0.000685
- Timestamp Resp: 1706546737.508663 0.000247 0.000151
- Timestamp Resp: 1706546737.594484 0.001055 0.000847
- Timestamp Resp: 1706546737.608003 0.001808 0.001628
- Timestamp Resp: 1706546737.641944 0.000428 0.000249
- Timestamp Resp: 1706546737.649923 0.000443 0.000248
- Timestamp Resp: 1706546737.658738 0.000960 0.000771
- Timestamp Resp: 1706546737.662729 0.000358 0.000198
- Timestamp Resp: 1706546737.711666 0.000444 0.000258
@justinhippo , can you share the resolution details if any, please? I'm sure it'll be useful to others
The resolution here is that it is some unrelated issue with some other aspect of my stack, unrelated to vagrant, but exposed through the fast concurrent responses from vagrant