password-hashes
password-hashes copied to clipboard
argon2: optimized implementation
The current argon2
crate implementation is a translation of ref.c
from the reference implementation:
https://github.com/P-H-C/phc-winner-argon2/blob/92cd2e1/src/ref.c
It could be improved by translating opt.c
instead, which provides e.g. SIMD support:
https://github.com/P-H-C/phc-winner-argon2/blob/92cd2e1/src/opt.c
👋 Hey there, would it be possible to assign this issue to me? I'd like to implement it as part of a work project and have some allocated time.
We're expecting that this should provide a fairly significant speedup to x86 users who have systems made in the last 10 years. My preliminary benchmarking against the C library shows a 40% improvement in speed with 64 MiB of memory, and the "helpfulness" of the optimization scales down with the memory size as it decreases.
I don't plan to port the AVX512 implementation right away since I don't have a way to test it and, IMO, it's of limited usefulness in the correct scenario where passwords are hashed on the client, since most consumer CPUs don't support it anyway.
@complexspaces sounds great! Even the AVX2 implementation would be fantastic.
FWIW we use the cpufeatures
crate elsewhere to autodetect AVX2 and use it when available.
Given that #440 has been merged, is this still an issue? If so, what needs to be done to address it?
#440 is a nice stopgap using auto-vectorization, but opt.c
contains a natively AVX2/AVX512-optimized implementation that would be nice to eventually port over.
As an update on #408, the current argon2 AVX2 implementation is now only 2-3x slower than the Go implementation even for high Parallelism
costs, and (as expected) beats the Go implementation when Parallelism = 1
. Tested on an AMD Ryzen 7 5700G, best of 3 runs.
With Memory = 128 MiB
, Time = 64
Parallelism | Rust Time | Go Time | C Time |
---|---|---|---|
1 | 3.36 s | 4.02 s | 2.62 s |
2 | 3.41 s | 2.22 s | 1.68 s |
4 | 3.45 s | 1.48 s | 1.31 s |
8 | 3.49 s | 1.24 s | 1.26 s |
16 | 3.52 s | 1.21 s | 1.24 s |
With Memory = 64 MiB
, Time = 32
Parallelism | Rust Time | Go Time | C Time |
---|---|---|---|
1 | 805 ms | 965 ms | 621 ms |
2 | 825 ms | 527 ms | 414 ms |
4 | 833 ms | 348 ms | 309 ms |
8 | 847 ms | 298 ms | 298 ms |
16 | 853 ms | 287 ms | 291 ms |
Rust Implementation
use std::time::Instant;
use argon2::{Argon2, Params};
// Password that every benchmark run hashes.
static PASSWORD: &[u8] = "hunter2".as_bytes();
// Salt length in bytes.
const SALT_LEN: usize = 16;
// All-zero salt, so every run hashes the same input.
static SALT: [u8; SALT_LEN] = [0u8; SALT_LEN];
// Size in bytes of the output buffer the derived key is written into.
const KEY_LEN: usize = 32;
/// Hashes `PASSWORD` with `SALT` once for each parallelism value in
/// 1, 2, 4, 8 and 16, printing the parameters, the elapsed time of the hash
/// call and the hex-encoded key on one line per run.
///
/// `t_cost` and `m_cost` are forwarded unchanged to `Params::new`.
///
/// # Panics
///
/// Panics (via `unwrap`) if `Params::new` rejects the parameters or if
/// `hash_password_into` fails.
fn benchmark_keys(t_cost: u32, m_cost: u32) {
    const LANES: [u32; 5] = [1, 2, 4, 8, 16];

    // One output buffer, overwritten by every run.
    let mut derived = [0u8; KEY_LEN];
    for p_cost in LANES {
        let hasher = Argon2::from(Params::new(m_cost, t_cost, p_cost, None).unwrap());

        // Only the hashing call itself sits between the two timestamps.
        let started = Instant::now();
        hasher
            .hash_password_into(PASSWORD, &SALT, &mut derived)
            .unwrap();
        let elapsed = started.elapsed();

        let hex: String = derived.iter().map(|byte| format!("{:02x}", byte)).collect();
        print!("t={} m={} p={} ({:?}): ", t_cost, m_cost, p_cost, elapsed);
        println!("{}", hex);
    }
}
fn main() {
benchmark_keys(64, 1 << 17);
benchmark_keys(32, 1 << 16);
}
Profile
[profile.release]
strip = "symbols"
lto = true
panic = "abort"
incremental = false
codegen-units = 1
Run
RUSTFLAGS="-C target-cpu=native" cargo run --release
Go Implementation
package main
import (
"encoding/hex"
"fmt"
"time"
"golang.org/x/crypto/argon2"
)
const (
// SaltLen is the size of the salt array in bytes.
SaltLen = 16
// KeyLen is the key length requested from argon2.IDKey.
KeyLen = 32
)
// benchmark_keys derives an Argon2id key from a fixed password and an
// all-zero salt once for each parallelism value 1, 2, 4, 8 and 16. For every
// run it prints the parameters, the elapsed time of the IDKey call and the
// hex-encoded key. t_cost and m_cost are forwarded unchanged to argon2.IDKey.
func benchmark_keys(t_cost uint32, m_cost uint32) {
	var zeroSalt [SaltLen]byte
	secret := []byte("hunter2")
	lanes := []uint8{1, 2, 4, 8, 16}

	for _, p_cost := range lanes {
		// Only the key derivation sits between the two timestamps.
		started := time.Now()
		derived := argon2.IDKey(secret, zeroSalt[:], t_cost, m_cost, p_cost, KeyLen)
		elapsed := time.Since(started)

		encoded := hex.EncodeToString(derived)
		fmt.Printf("t=%d m=%d p=%d (%s): %s\n", t_cost, m_cost, p_cost, elapsed, encoded)
	}
}
// main runs the benchmark for (t_cost = 64, m_cost = 1<<17) and then for
// (t_cost = 32, m_cost = 1<<16).
func main() {
	configs := [][2]uint32{{64, 1 << 17}, {32, 1 << 16}}
	for _, cfg := range configs {
		benchmark_keys(cfg[0], cfg[1])
	}
}
Built with
GOAMD64=v3 go build
C Implementation
#include "argon2.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
/* Salt length in bytes. */
#define SALT_LEN 16
/* Output key length in bytes. */
#define KEY_LEN 32
/* All-zero salt shared by every run. */
static uint8_t SALT[SALT_LEN] = {0};
/* Buffer that receives the derived key; overwritten on every run. */
static uint8_t KEY[KEY_LEN];
/* Parallelism values to benchmark, in order. */
static uint32_t PS[5] = {1,2,4,8,16};
/* Password that is hashed. */
static const char* PWD = "hunter2";
/* Timestamps taken immediately before and after each hash call. */
static struct timespec BEGIN;
static struct timespec END;
/*
 * Hash PWD with SALT using argon2id_hash_raw once per entry in PS, and for
 * each run print the parameters, the elapsed time in seconds and the
 * hex-encoded key.
 *
 * t_cost and m_cost are forwarded unchanged to argon2id_hash_raw. If a hash
 * call returns non-zero, the error code is printed and the process exits
 * with status 1.
 */
void benchmark_keys(uint32_t t_cost, uint32_t m_cost) {
    /* Keep the length as size_t, which is what strlen returns. */
    size_t pwdlen = strlen(PWD);
    /* Derive the run count from PS so the two cannot drift apart. */
    size_t n_runs = sizeof(PS) / sizeof(PS[0]);

    for (size_t run = 0; run < n_runs; ++run) {
        uint32_t p_cost = PS[run];

        /* Only the hash call sits between the two timestamps. */
        clock_gettime(CLOCK_MONOTONIC_RAW, &BEGIN);
        int r = argon2id_hash_raw(t_cost, m_cost, p_cost, PWD, pwdlen, SALT, SALT_LEN, KEY, KEY_LEN);
        clock_gettime(CLOCK_MONOTONIC_RAW, &END);

        if (r != 0) {
            printf("Error code %d\n", r);
            exit(1);
        }

        double duration = (END.tv_nsec - BEGIN.tv_nsec) / 1000000000.0 + (END.tv_sec - BEGIN.tv_sec);

        /* The cost parameters are unsigned, so print them with %u, not %d. */
        printf("t=%u m=%u p=%u (%fs): ", (unsigned)t_cost, (unsigned)m_cost, (unsigned)p_cost, duration);
        /* Distinct index name: the original reused `i` and shadowed the outer loop. */
        for (size_t j = 0; j < KEY_LEN; ++j) {
            printf("%02x", KEY[j]);
        }
        printf("\n");
    }
}
/*
 * Run the benchmark for (t_cost = 64, m_cost = 1 << 17) and then for
 * (t_cost = 32, m_cost = 1 << 16).
 */
int main() {
    static const uint32_t configs[2][2] = {{64, 1 << 17}, {32, 1 << 16}};
    for (int c = 0; c < 2; ++c) {
        benchmark_keys(configs[c][0], configs[c][1]);
    }
    return 0;
}
Built against the upstream library with -march=native
.