c3c icon indicating copy to clipboard operation
c3c copied to clipboard

[StdLib] Add C3-native `sscanf`

Open BWindey opened this issue 5 months ago • 2 comments

Libc provides `sscanf` (and family) to parse a string into variables according to a format string; it is essentially the exact opposite of `printf`. Since we already have a C3 version of `printf`, it would be great to have a C3 version of `sscanf` as well.

Example:

// Illustrates the proposed API: split a terminal escape sequence of the form
// "\e[<codepoint>;<modifier>u" into its two integer fields.
fn void main() {
	String seq = "\e[97;5u";
	int codepoint;
	int modifier;

	// Anything other than exactly two conversions is treated as a failure.
	if (string::sscanf(seq, "\e[%d;%du", &codepoint, &modifier) != 2) {
		io::eprintn("Failed to parse string");
		return;
	}
	io::printfn("Parsed codepoint %d and modifier %d", codepoint, modifier);
}

BWindey avatar Jun 26 '25 15:06 BWindey

Scanf is such a bad function in C though. I don't see people copying it to be honest.

lerno avatar Jun 27 '25 13:06 lerno

What do you mean by "bad function" and "don't see people copying it"?

BWindey avatar Jun 27 '25 18:06 BWindey

@BWindey there are some security issues, among other problems; see for example https://sternumiot.com/iot-blog/scanf-c-function-syntax-examples-and-security-best-practices/

joshring avatar Jul 01 '25 08:07 joshring

Seems like I'll be writing my own kind of sscanf purpose-built for my own good.

The problems with the C implementation could be resolved in a C3-native implementation though. It's up to you whether this should be added to the standard library or not. (The libc-binding is already present.)

BWindey avatar Jul 01 '25 08:07 BWindey

Ok, worked on an implementation today. It's not yet fully tested, documentation is lacking, and only supports base 10 for numbers, but it's a start. Oh, and code is horrible XD

But as proof of concept this counts!

The output of the code below is:

Original: Hi nr. -35.13, you're a very gentle person. Is that true?
Format:   Hi nr. %f, you're a %s person. Is that %b?

Parsed nr = -35.130000, trait = very gentle, question = 1
import std::io;

import std::math;

// Faults returned by `String.parse_format` in this file.
faultdef
	FORMAT_NO_MATCH,                   // a literal character of the format differs from the input
	UNKNOWN_TYPE_SPECIFIER,            // the character after '%' is not a supported specifier
	ARG_DOES_NOT_MATCH_TYPE_SPECIFIER, // the argument's type cannot receive the specifier's value
	NOT_ENOUGH_ARGS_FOR_FORMAT,        // a specifier was reached after all arguments were used
	MALFORMED_INTEGER,                 // the input at a "%d" is not a valid integer
	NO_STRING_END,                     // a "%s" has no usable literal after it to end the string
	INVALID_BOOLEAN,                   // the input at a "%b" is not a recognised boolean spelling
	DID_NOT_CONSUME_FULL_FORMAT,       // parsing stopped before the end of the format
	DID_NOT_CONSUME_FULL_STRING,       // parsing stopped before the end of the input
	MALFORMED_FLOAT;                   // the input at a "%f" is not a valid float

// Reports whether the type carried by `a` is a signed or unsigned integer kind.
fn bool any_is_intlike(any a) {
	switch (a.type.kindof) {
		case SIGNED_INT:
		case UNSIGNED_INT:
			return true;
		default:
			return false;
	}
}

// Reports whether the type carried by `a` is `String` or a slice whose
// element type is `char`.
fn bool any_is_string_like(any a) {
	if (a.type == String.typeid) return true;
	// Only slices are asked for their inner type.
	return a.type.kindof == SLICE && a.type.inner == char.typeid;
}

// Reports whether the type carried by `a` is exactly `bool`.
fn bool any_is_bool_like(any a) {
	typeid actual = a.type;
	return actual == bool.typeid;
}

// Reports whether the type carried by `a` is of the floating-point kind.
fn bool any_is_float_like(any a) {
	switch (a.type.kindof) {
		case FLOAT:
			return true;
		default:
			return false;
	}
}

// Multiplies, in place, the number that `a` points to by `mult`.
// `mult` is cast to the pointee's own type before the multiplication.
// Only the builtin integer types plus `float` and `double` are handled;
// a value of any other type is left untouched.
macro void multiply_to_any(any a, mult) {
	void* target = a.ptr;
	switch (a.type) {
		case char:    *(char*) target *= (char) mult;
		case ichar:   *(ichar*) target *= (ichar) mult;
		case short:   *(short*) target *= (short) mult;
		case ushort:  *(ushort*) target *= (ushort) mult;
		case int:     *(int*) target *= (int) mult;
		case uint:    *(uint*) target *= (uint) mult;
		case long:    *(long*) target *= (long) mult;
		case ulong:   *(ulong*) target *= (ulong) mult;
		case int128:  *(int128*) target *= (int128) mult;
		case uint128: *(uint128*) target *= (uint128) mult;
		case float:   *(float*) target *= (float) mult;
		case double:  *(double*) target *= (double) mult;
		default:      break;
	}
}

// Adds `add`, in place, to the number that `a` points to.
// `add` is cast to the pointee's own type before the addition.
// Only the builtin integer types plus `float` and `double` are handled;
// a value of any other type is left untouched.
macro void add_to_any(any a, add) {
	void* target = a.ptr;
	switch (a.type) {
		case char:    *(char*) target += (char) add;
		case ichar:   *(ichar*) target += (ichar) add;
		case short:   *(short*) target += (short) add;
		case ushort:  *(ushort*) target += (ushort) add;
		case int:     *(int*) target += (int) add;
		case uint:    *(uint*) target += (uint) add;
		case long:    *(long*) target += (long) add;
		case ulong:   *(ulong*) target += (ulong) add;
		case int128:  *(int128*) target += (int128) add;
		case uint128: *(uint128*) target += (uint128) add;
		case float:   *(float*) target += (float) add;
		case double:  *(double*) target += (double) add;
		default:      break;
	}
}

// Replaces, in place, the number that `a` points to with its negation.
// Only the builtin integer types plus `float` and `double` are handled;
// a value of any other type is left untouched.
fn void negate_any(any a) {
	void* target = a.ptr;
	switch (a.type) {
		case char:    *(char*) target = -(*(char*) target);
		case ichar:   *(ichar*) target = -(*(ichar*) target);
		case short:   *(short*) target = -(*(short*) target);
		case ushort:  *(ushort*) target = -(*(ushort*) target);
		case int:     *(int*) target = -(*(int*) target);
		case uint:    *(uint*) target = -(*(uint*) target);
		case long:    *(long*) target = -(*(long*) target);
		case ulong:   *(ulong*) target = -(*(ulong*) target);
		case int128:  *(int128*) target = -(*(int128*) target);
		case uint128: *(uint128*) target = -(*(uint128*) target);
		case float:   *(float*) target = -(*(float*) target);
		case double:  *(double*) target = -(*(double*) target);
		default:      break;
	}
}

// Parses `self` according to `fmt` and stores every converted value through
// the matching pointer in `args` (the inverse of a printf-style format).
//
// Supported specifiers:
//   %%  a literal '%'
//   %d  base-10 integer with an optional sign, into any builtin integer type
//   %s  the text up to the literal part of the format that follows the
//       specifier (or the rest of the input when "%s" ends the format),
//       into a String / char[]; the stored slice points into `self`
//   %b  boolean spelled "y", "1", "true" or "n", "0", "false"
//   %f  base-10 number with optional sign, fraction and exponent,
//       into a float or double
//
// Every other character of the format must match the input exactly, and
// both the format and the input have to be consumed completely.
//
// Returns 0 on success, otherwise one of the faults declared in this file.
// Integer overflow is not detected, and a negative value parsed into an
// unsigned integer is stored negated without a range check.
fn int? String.parse_format(String self, String fmt, any ...args) {
	usz fmt_idx = 0;
	usz slf_idx = 0;
	usz arg_idx = 0;

	while (fmt_idx < fmt.len && slf_idx < self.len) {
		if (fmt[fmt_idx] != '%') {
			// Literal format characters must match the input one to one.
			if (fmt[fmt_idx] != self[slf_idx]) {
				return FORMAT_NO_MATCH?;
			}
			fmt_idx++;
			slf_idx++;
			continue;
		}

		// A '%' that ends the format has no specifier to read.
		if (fmt_idx + 1 >= fmt.len) {
			return UNKNOWN_TYPE_SPECIFIER?;
		}
		fmt_idx++;
		char spec = fmt[fmt_idx];

		if (spec == '%') {
			// "%%" is interpreted as literal '%' and needs no argument.
			if (self[slf_idx] != '%') {
				return FORMAT_NO_MATCH?;
			}
			slf_idx++;
			fmt_idx++;
			continue;
		}

		if (arg_idx >= args.len) {
			return NOT_ENOUGH_ARGS_FOR_FORMAT?;
		}
		any arg = args[arg_idx];

		switch (spec) {
			case 'd':
				if (!any_is_intlike(arg)) {
					return ARG_DOES_NOT_MATCH_TYPE_SPECIFIER?;
				}
				mem::set(arg.ptr, 0, arg.type.sizeof);

				bool is_negative = false;
				if (self[slf_idx] == '-' || self[slf_idx] == '+') {
					is_negative = self[slf_idx] == '-';
					slf_idx++;
				}

				usz digits_start = slf_idx;
				while (slf_idx < self.len && self[slf_idx] >= '0' && self[slf_idx] <= '9') {
					multiply_to_any(arg, 10);
					add_to_any(arg, self[slf_idx] - '0');
					slf_idx++;
				}
				// A lone sign, or no digit at all, is not an integer.
				if (slf_idx == digits_start) {
					return MALFORMED_INTEGER?;
				}
				if (is_negative) {
					negate_any(arg);
				}

			case 's':
				if (!any_is_string_like(arg)) {
					return ARG_DOES_NOT_MATCH_TYPE_SPECIFIER?;
				}

				// The string ends where the literal text following "%s" in
				// the format is found in the input.
				usz lit_start = fmt_idx + 1;
				usz lit_len = 0;
				if (lit_start < fmt.len && fmt[lit_start] == '%') {
					// Disallow "%s%x" (where 'x' != '%'): without a literal
					// in between there is nothing that ends the string.
					if (lit_start + 1 >= fmt.len || fmt[lit_start + 1] != '%') {
						return NO_STRING_END?;
					}
					// "%s%%": the string ends at a literal '%'.
					lit_len = 1;
				} else {
					while (lit_start + lit_len < fmt.len && fmt[lit_start + lit_len] != '%') {
						lit_len++;
					}
				}
				String delimiter = lit_len > 0 ? fmt[lit_start:lit_len] : "";

				usz str_start = slf_idx;
				if (delimiter.len == 0) {
					// "%s" ends the format: it takes the rest of the input.
					slf_idx = self.len;
				} else {
					while (slf_idx < self.len) {
						// Only compare when the delimiter still fits.
						if (
							slf_idx + delimiter.len <= self.len
							&& self[slf_idx:delimiter.len] == delimiter
						) {
							break;
						}
						slf_idx++;
					}
				}
				// Start/length form stays valid for an empty result.
				*(String*) arg.ptr = self[str_start:slf_idx - str_start];

			case 'b':
				if (!any_is_bool_like(arg)) {
					return ARG_DOES_NOT_MATCH_TYPE_SPECIFIER?;
				}

				switch (self[slf_idx]) {
					case 'y':
					case '1':
						*(bool*) arg.ptr = true;
						slf_idx++;
					case 'n':
					case '0':
						*(bool*) arg.ptr = false;
						slf_idx++;
					case 't':
						// '>' rather than '>=': the word may end the input.
						if (
							slf_idx + "true".len > self.len
							|| self[slf_idx:"true".len] != "true"
						) {
							return INVALID_BOOLEAN?;
						}
						*(bool*) arg.ptr = true;
						slf_idx += "true".len;
					case 'f':
						if (
							slf_idx + "false".len > self.len
							|| self[slf_idx:"false".len] != "false"
						) {
							return INVALID_BOOLEAN?;
						}
						*(bool*) arg.ptr = false;
						slf_idx += "false".len;
					default:
						return INVALID_BOOLEAN?;
				}

			case 'f':
				// add_to_any can only store into float and double; other
				// floating-point types would silently stay zero.
				if (
					!any_is_float_like(arg)
					|| (arg.type != float.typeid && arg.type != double.typeid)
				) {
					return ARG_DOES_NOT_MATCH_TYPE_SPECIFIER?;
				}
				mem::set(arg.ptr, 0, arg.type.sizeof);

				bool is_negative = false;
				if (self[slf_idx] == '-' || self[slf_idx] == '+') {
					is_negative = self[slf_idx] == '-';
					slf_idx++;
				}

				// Whole part
				ulong whole_part = 0;
				usz mantissa_digits = 0;
				while (slf_idx < self.len && self[slf_idx] >= '0' && self[slf_idx] <= '9') {
					whole_part = whole_part * 10 + (ulong) (self[slf_idx] - '0');
					slf_idx++;
					mantissa_digits++;
				}
				double value = (double) whole_part;

				// Optional fraction part
				if (slf_idx < self.len && self[slf_idx] == '.') {
					slf_idx++;
					if (slf_idx >= self.len) {
						return MALFORMED_FLOAT?;
					}
					ulong fraction_part = 0;
					usz fraction_digits = 0;
					while (slf_idx < self.len && self[slf_idx] >= '0' && self[slf_idx] <= '9') {
						fraction_part = fraction_part * 10 + (ulong) (self[slf_idx] - '0');
						slf_idx++;
						fraction_digits++;
					}
					if (fraction_digits > 0) {
						value += (double) fraction_part / math::pow(10.0, (int) fraction_digits);
					}
					mantissa_digits += fraction_digits;
				}
				if (mantissa_digits == 0) {
					return MALFORMED_FLOAT?;
				}

				// Optional exponent; it may directly follow the whole part.
				if (slf_idx < self.len && (self[slf_idx] == 'e' || self[slf_idx] == 'E')) {
					slf_idx++;
					if (slf_idx >= self.len) {
						return MALFORMED_FLOAT?;
					}

					bool exp_is_negative = false;
					if (self[slf_idx] == '-' || self[slf_idx] == '+') {
						exp_is_negative = self[slf_idx] == '-';
						slf_idx++;
					}

					int exponent = 0;
					usz exp_start = slf_idx;
					while (slf_idx < self.len && self[slf_idx] >= '0' && self[slf_idx] <= '9') {
						exponent = exponent * 10 + (int) (self[slf_idx] - '0');
						slf_idx++;
					}
					if (slf_idx == exp_start) {
						return MALFORMED_FLOAT?;
					}
					if (exp_is_negative) {
						exponent = -exponent;
					}
					value *= math::pow(10.0, exponent);
				}

				if (is_negative) {
					value = -value;
				}
				// The target was zeroed above, so adding stores the value.
				add_to_any(arg, value);

			default:
				return UNKNOWN_TYPE_SPECIFIER?;
		}

		// Every conversion uses up one specifier character and one argument,
		// including conversions that reach the end of the input.
		fmt_idx++;
		arg_idx++;
	}

	if (fmt_idx != fmt.len) {
		return DID_NOT_CONSUME_FULL_FORMAT?;
	} else if (slf_idx != self.len) {
		return DID_NOT_CONSUME_FULL_STRING?;
	}
	return 0;
}

// Demo: parses one sentence with a "%f", a "%s" and a "%b" specifier and
// prints either the extracted values or the fault that stopped the parse.
fn void main() {
	String input = "Hi nr. -35.13, you're a very gentle person. Is that true?";
	String pattern = "Hi nr. %f, you're a %s person. Is that %b?";

	double nr;
	String trait;
	bool question;

	if (catch error = input.parse_format(pattern, &nr, &trait, &question)) {
		io::printfn("Failed to parse string: %s", error);
		return;
	}

	io::printfn("Original: %s", input);
	io::printfn("Format:   %s\n", pattern);
	io::printfn("Parsed nr = %f, trait = %s, question = %b", nr, trait, question);
}

BWindey avatar Jul 01 '25 22:07 BWindey

You can have a look at how the formatter handles `any`.

lerno avatar Jul 01 '25 23:07 lerno