From d0c0611a3796374085446561712c5b7af3407192 Mon Sep 17 00:00:00 2001 From: Gunter Schmidt Date: Sun, 1 Mar 2026 22:37:53 +0100 Subject: [PATCH 1/3] feat: u64 for --bytes and --ignore-initial fix: bumped up tempfile to "3.26.0" The variables for --bytes, --ignore-initial and line count where size 'usize', thus limiting the readable bytes on 32-bit systems. GNU cmp is compiled with LFS (Large File Support) and allows i64 values. This is now all u64, which works also on 32-bit systems with Rust. There is no reason to implement a 32-bit barrier for 32 bit machines. Additionally the --bytes limit can be set to 'u128' using the feature "cmp_bytes_limit_128_bit". The performance impact would be negligible, as there only few calculations each time a full block is read from the file. --- Cargo.toml | 8 +++++- src/cmp.rs | 77 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 53 insertions(+), 32 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 29331c3..ad88f0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ unicode-width = "0.2.0" pretty_assertions = "1.4.0" assert_cmd = "2.0.14" predicates = "3.1.0" -tempfile = "3.10.1" +tempfile = "3.26.0" [profile.release] lto = "thin" @@ -40,3 +40,9 @@ panic = "abort" [profile.dist] inherits = "release" lto = "thin" + +[features] +# default = ["cmp_bytes_limit_128_bit"] +# allows to set the --bytes limit from u64 to u128, if limits larger than Exabyte are required. +cmp_bytes_limit_128_bit = [] + diff --git a/src/cmp.rs b/src/cmp.rs index 1db866b..3ec39b0 100644 --- a/src/cmp.rs +++ b/src/cmp.rs @@ -20,15 +20,23 @@ use std::os::unix::fs::MetadataExt; #[cfg(target_os = "windows")] use std::os::windows::fs::MetadataExt; +/// for --bytes, so really large number limits can be expressed, like 1Y. +#[cfg(not(feature = "cmp_bytes_limit_128_bit"))] +pub type Bytes = u64; +#[cfg(feature = "cmp_bytes_limit_128_bit")] +pub type Bytes = u128; +// ignore initial is currently limited to u64, as take(skip) is used. +pub type IgnInit = u64; + #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct Params { executable: OsString, from: OsString, to: OsString, print_bytes: bool, - skip_a: Option, - skip_b: Option, - max_bytes: Option, + skip_a: Option, + skip_b: Option, + max_bytes: Option, verbose: bool, quiet: bool, } @@ -66,13 +74,13 @@ pub fn parse_params>(mut opts: Peekable) -> Resu }; let executable_str = executable.to_string_lossy().to_string(); - let parse_skip = |param: &str, skip_desc: &str| -> Result { + let parse_skip = |param: &str, skip_desc: &str| -> Result { let suffix_start = param .find(|b: char| !b.is_ascii_digit()) .unwrap_or(param.len()); - let mut num = match param[..suffix_start].parse::() { + let mut num = match param[..suffix_start].parse::() { Ok(num) => num, - Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => usize::MAX, + Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => IgnInit::MAX, Err(_) => { return Err(format!( "{executable_str}: invalid --ignore-initial value '{skip_desc}'" @@ -83,7 +91,7 @@ pub fn parse_params>(mut opts: Peekable) -> Resu if suffix_start != param.len() { // Note that GNU cmp advertises supporting up to Y, but fails if you try // to actually use anything beyond E. - let multiplier: usize = match ¶m[suffix_start..] { + let multiplier: IgnInit = match ¶m[suffix_start..] { "kB" => 1_000, "K" => 1_024, "MB" => 1_000_000, @@ -106,10 +114,11 @@ pub fn parse_params>(mut opts: Peekable) -> Resu #[cfg(not(target_pointer_width = "64"))] usize::MAX } - "ZB" => usize::MAX, // 1_000_000_000_000_000_000_000, - "Z" => usize::MAX, // 1_180_591_620_717_411_303_424, - "YB" => usize::MAX, // 1_000_000_000_000_000_000_000_000, - "Y" => usize::MAX, // 1_208_925_819_614_629_174_706_176, + // TODO setting usize:MAX does not mimic GNU cmp behavior, it should be an error. + "ZB" => IgnInit::MAX, // 1_000_000_000_000_000_000_000, + "Z" => IgnInit::MAX, // 1_180_591_620_717_411_303_424, + "YB" => IgnInit::MAX, // 1_000_000_000_000_000_000_000_000, + "Y" => IgnInit::MAX, // 1_208_925_819_614_629_174_706_176, _ => { return Err(format!( "{executable_str}: invalid --ignore-initial value '{skip_desc}'" @@ -119,7 +128,7 @@ pub fn parse_params>(mut opts: Peekable) -> Resu num = match num.overflowing_mul(multiplier) { (n, false) => n, - _ => usize::MAX, + _ => IgnInit::MAX, } } @@ -173,9 +182,10 @@ pub fn parse_params>(mut opts: Peekable) -> Resu let (_, arg) = param_str.split_once('=').unwrap(); arg.to_string() }; - let max_bytes = match max_bytes.parse::() { + let max_bytes = match max_bytes.parse::() { Ok(num) => num, - Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => usize::MAX, + // TODO limit to MAX is dangerous, this should become an error like in GNU cmp. + Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => Bytes::MAX, Err(_) => { return Err(format!( "{executable_str}: invalid --bytes value '{max_bytes}'" @@ -233,7 +243,7 @@ pub fn parse_params>(mut opts: Peekable) -> Resu } // Do as GNU cmp, and completely disable printing if we are - // outputing to /dev/null. + // outputting to /dev/null. #[cfg(not(target_os = "windows"))] if is_stdout_dev_null() { params.quiet = true; @@ -285,7 +295,7 @@ pub fn parse_params>(mut opts: Peekable) -> Resu fn prepare_reader( path: &OsString, - skip: &Option, + skip: &Option, params: &Params, ) -> Result, String> { let mut reader: Box = if path == "-" { @@ -326,7 +336,7 @@ pub fn cmp(params: &Params) -> Result { let mut from = prepare_reader(¶ms.from, ¶ms.skip_a, params)?; let mut to = prepare_reader(¶ms.to, ¶ms.skip_b, params)?; - let mut offset_width = params.max_bytes.unwrap_or(usize::MAX); + let mut offset_width = params.max_bytes.unwrap_or(Bytes::MAX); if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(¶ms.from), fs::metadata(¶ms.to)) { #[cfg(not(target_os = "windows"))] @@ -341,7 +351,7 @@ pub fn cmp(params: &Params) -> Result { return Ok(Cmp::Different); } - let smaller = cmp::min(a_size, b_size) as usize; + let smaller = cmp::min(a_size, b_size) as Bytes; offset_width = cmp::min(smaller, offset_width); } @@ -350,8 +360,8 @@ pub fn cmp(params: &Params) -> Result { // Capacity calc: at_byte width + 2 x 3-byte octal numbers + 2 x 4-byte value + 4 spaces let mut output = Vec::::with_capacity(offset_width + 3 * 2 + 4 * 2 + 4); - let mut at_byte = 1; - let mut at_line = 1; + let mut at_byte: Bytes = 1; + let mut at_line: u64 = 1; let mut start_of_line = true; let mut stdout = BufWriter::new(io::stdout().lock()); let mut compare = Cmp::Equal; @@ -401,8 +411,8 @@ pub fn cmp(params: &Params) -> Result { if from_buf[..consumed] == to_buf[..consumed] { let last = from_buf[..consumed].last().unwrap(); - at_byte += consumed; - at_line += from_buf[..consumed].iter().filter(|&c| *c == b'\n').count(); + at_byte += consumed as Bytes; + at_line += (from_buf[..consumed].iter().filter(|&c| *c == b'\n').count()) as u64; start_of_line = *last == b'\n'; @@ -590,7 +600,7 @@ fn format_visible_byte(byte: u8) -> String { fn format_verbose_difference( from_byte: u8, to_byte: u8, - at_byte: usize, + at_byte: Bytes, offset_width: usize, output: &mut Vec, params: &Params, @@ -655,7 +665,7 @@ fn format_verbose_difference( } #[inline] -fn report_eof(at_byte: usize, at_line: usize, start_of_line: bool, eof_on: &str, params: &Params) { +fn report_eof(at_byte: Bytes, at_line: u64, start_of_line: bool, eof_on: &str, params: &Params) { if params.quiet { return; } @@ -707,7 +717,7 @@ fn is_posix_locale() -> bool { } #[inline] -fn report_difference(from_byte: u8, to_byte: u8, at_byte: usize, at_line: usize, params: &Params) { +fn report_difference(from_byte: u8, to_byte: u8, at_byte: Bytes, at_line: u64, params: &Params) { if params.quiet { return; } @@ -804,7 +814,7 @@ mod tests { from: os("foo"), to: os("bar"), skip_a: Some(1), - skip_b: Some(usize::MAX), + skip_b: Some(IgnInit::MAX), ..Default::default() }), parse_params( @@ -982,7 +992,7 @@ mod tests { executable: os("cmp"), from: os("foo"), to: os("bar"), - max_bytes: Some(usize::MAX), + max_bytes: Some(Bytes::MAX), ..Default::default() }), parse_params( @@ -999,6 +1009,7 @@ mod tests { ); // Failure case + // TODO This is actually fine in GNU cmp. --bytes does not have a unit parser yet. assert_eq!( Err("cmp: invalid --bytes value '1K'".to_string()), parse_params( @@ -1044,8 +1055,8 @@ mod tests { executable: os("cmp"), from: os("foo"), to: os("bar"), - skip_a: Some(usize::MAX), - skip_b: Some(usize::MAX), + skip_a: Some(IgnInit::MAX), + skip_b: Some(IgnInit::MAX), ..Default::default() }), parse_params( @@ -1116,8 +1127,12 @@ mod tests { .enumerate() { let values = [ - 1_000usize.checked_pow((i + 1) as u32).unwrap_or(usize::MAX), - 1024usize.checked_pow((i + 1) as u32).unwrap_or(usize::MAX), + (1_000 as IgnInit) + .checked_pow((i + 1) as u32) + .unwrap_or(IgnInit::MAX), + (1024 as IgnInit) + .checked_pow((i + 1) as u32) + .unwrap_or(IgnInit::MAX), ]; for (j, v) in values.iter().enumerate() { assert_eq!( From 884edb2a501dc2ebc766e9922815c75dcf955259 Mon Sep 17 00:00:00 2001 From: Gunter Schmidt Date: Sun, 1 Mar 2026 23:15:23 +0100 Subject: [PATCH 2/3] fix: removed 32-bit conditional compiling --- src/cmp.rs | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/cmp.rs b/src/cmp.rs index 3ec39b0..923f51f 100644 --- a/src/cmp.rs +++ b/src/cmp.rs @@ -98,22 +98,12 @@ pub fn parse_params>(mut opts: Peekable) -> Resu "M" => 1_048_576, "GB" => 1_000_000_000, "G" => 1_073_741_824, - // This only generates a warning when compiling for target_pointer_width < 64 - #[allow(unused_variables)] - suffix @ ("TB" | "T" | "PB" | "P" | "EB" | "E") => { - #[cfg(target_pointer_width = "64")] - match suffix { - "TB" => 1_000_000_000_000, - "T" => 1_099_511_627_776, - "PB" => 1_000_000_000_000_000, - "P" => 1_125_899_906_842_624, - "EB" => 1_000_000_000_000_000_000, - "E" => 1_152_921_504_606_846_976, - _ => unreachable!(), - } - #[cfg(not(target_pointer_width = "64"))] - usize::MAX - } + "TB" => 1_000_000_000_000, + "T" => 1_099_511_627_776, + "PB" => 1_000_000_000_000_000, + "P" => 1_125_899_906_842_624, + "EB" => 1_000_000_000_000_000_000, + "E" => 1_152_921_504_606_846_976, // TODO setting usize:MAX does not mimic GNU cmp behavior, it should be an error. "ZB" => IgnInit::MAX, // 1_000_000_000_000_000_000_000, "Z" => IgnInit::MAX, // 1_180_591_620_717_411_303_424, From 2e9d3954438b0bea21f605e88adc7273da48cdc8 Mon Sep 17 00:00:00 2001 From: Gunter Schmidt Date: Mon, 2 Mar 2026 15:31:19 +0100 Subject: [PATCH 3/3] fix: clippy warning --- src/cmp.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cmp.rs b/src/cmp.rs index 923f51f..442eb02 100644 --- a/src/cmp.rs +++ b/src/cmp.rs @@ -285,7 +285,7 @@ pub fn parse_params>(mut opts: Peekable) -> Resu fn prepare_reader( path: &OsString, - skip: &Option, + skip: &Option, params: &Params, ) -> Result, String> { let mut reader: Box = if path == "-" { @@ -304,6 +304,8 @@ fn prepare_reader( }; if let Some(skip) = skip { + // cast as u64 must remain, because value of IgnInit data type could be changed. + #[allow(clippy::unnecessary_cast)] if let Err(e) = io::copy(&mut reader.by_ref().take(*skip as u64), &mut io::sink()) { return Err(format_failure_to_read_input_file( ¶ms.executable,