rust-csv
rust-csv copied to clipboard
`Position::line` reports incorrect line when input has CRLF line endings
Given reader: csv::Reader, when reader.read_record returns, reader.position().line() should return the line number where the next record begins.
When the input has a single-byte record separator like \n, that's exactly what it does. However, when the input uses \r\n line endings, the last record's line number is reported instead.
This test demonstrates the issue:
#[test]
fn test_crlf_line_numbers() {
let input = std::io::Cursor::new(b"foo,bar\r\none,two\r\n");
let mut reader =
csv::ReaderBuilder::new()
.terminator(csv::Terminator::CRLF)
.has_headers(false)
.from_reader(input);
assert_eq!(reader.position().line(), 1);
reader.read_record(&mut csv::StringRecord::new()).unwrap();
assert_eq!(reader.position().line(), 2);
}
This test fails at the final assert_eq! because reader.position().line() still returns 1. If, however, the \rs are removed from the input, this test passes.
The problem seems to be that Reader::read_record consumes only the first byte of the terminator, but line is only incremented here when the parser consumes a \n byte. If the terminator is \r\n, then it only consumes \n and increments the line number when it starts parsing the next record.
For example, if you add the following test to the tests module in csv-core/src/reader.rs:
#[test]
fn stream_record_with_crlf() {
use crate::ReadRecordResult::*;
let mut inp = b("foo,bar\r\nbaz,\r\n3,4\r\n");
let out = &mut [0; 1024];
let ends = &mut [0; 10];
let mut rdr = ReaderBuilder::new().terminator(Terminator::CRLF).build();
assert_eq!(rdr.line(), 1);
assert_read_record!(rdr, &inp, out, ends, 9, 6, 2, Record);
assert_eq!(ends[0], 3);
assert_eq!(&out[0..3], b"foo");
assert_eq!(ends[1], 6);
assert_eq!(&out[3..6], b"bar");
assert_eq!(rdr.line(), 1);
inp = &inp[9..];
assert_read_record!(rdr, &inp, out, ends, 6, 3, 1, Record);
assert_eq!(ends[0], 3);
assert_eq!(&out[0..3], b"baz");
assert_eq!(rdr.line(), 2);
inp = &inp[6..];
assert_read_record!(rdr, &inp, out, ends, 5, 2, 2, Record);
assert_eq!(ends[0], 1);
assert_eq!(&out[0..1], b"3");
assert_eq!(ends[1], 2);
assert_eq!(&out[1..2], b"4");
assert_eq!(rdr.line(), 3);
inp = &inp[5..];
assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
}
The test will fail at the first assert_read_record! because the parser has only consumed 8 bytes, up to the first \r. For it to consume a \n and increment the line number, it would need to have consumed 9 bytes there.
When csv::Reader reads into some record: csv::StringRecord, then record.position().unwrap().line() is also incorrect if CRLF line endings are used.
As a workaround, line numbers are reported correctly if carriage returns are stripped out of the input before feeding it to the CSV parser. Here's a simple reader adapter that does this:
Click to show code
use std::io::{self, Read};
pub struct CrFilteringReader<R> {
reader: R,
}
impl<R> CrFilteringReader<R>
where
R: Read,
{
pub fn new(reader: R) -> Self {
Self { reader }
}
}
impl<R> Read for CrFilteringReader<R>
where
R: Read,
{
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
loop {
// Read some bytes. Bail if that fails.
let bytes_read: usize = self.reader.read(buf)?;
if bytes_read == 0 {
return Ok(0);
}
// Look for CRs and overwrite them with the bytes after them.
let mut cursor: usize = 0;
let mut move_back: usize = 0;
let mut bytes_read_after_move: usize = bytes_read;
while cursor < bytes_read {
if buf[cursor] == b'\r' {
move_back += 1;
bytes_read_after_move -= 1;
}
else if move_back != 0 {
buf[cursor - move_back] = buf[cursor];
}
cursor += 1;
}
// If that emptied out the entire buffer, then try again.
if bytes_read_after_move == 0 {
continue;
}
// Done.
break Ok(bytes_read_after_move);
}
}
}
#[test]
fn test_basic() {
let input = b"\r\nfoo\r\r\nbar\nbaz\n\r";
let mut buf = Vec::with_capacity(input.len());
CrFilteringReader::new(io::Cursor::new(input)).read_to_end(&mut buf).unwrap();
assert_eq!(&*buf, b"\nfoo\nbar\nbaz\n");
}
#[test]
fn test_all_cr() {
let input = b"\r\r\r\r\r";
let mut buf = Vec::with_capacity(input.len());
CrFilteringReader::new(io::Cursor::new(input)).read_to_end(&mut buf).unwrap();
assert_eq!(&*buf, b"");
}
I have just observed the same issue (files with LF work, CRLF does not), is there any PR or other efforts to fix this?