webgraph-rs icon indicating copy to clipboard operation
webgraph-rs copied to clipboard

LLP: Failure to serialize labels causes a segfault

Open progval opened this issue 1 year ago • 5 comments

For some reason, when we early-return from this line (e.g., because we ran out of disk space in the temp dir):

https://github.com/vigna/webgraph-rs/blob/9cd05365def84b8f3bc2a61587c1a68a00ad818c/src/algo/llp/mod.rs#L302

then dropping the LabelStore segfaults.

For example, with this patch:

diff --git a/src/algo/llp/mod.rs b/src/algo/llp/mod.rs
index 2b5f514..6f14043 100644
--- a/src/algo/llp/mod.rs
+++ b/src/algo/llp/mod.rs
@@ -43,6 +43,7 @@ use rand::SeedableRng;
 use rayon::prelude::*;
 use std::collections::HashMap;
 use std::env::temp_dir;
+use std::mem::ManuallyDrop;
 use std::path::PathBuf;
 use std::sync::atomic::Ordering;
 use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize};
@@ -152,6 +153,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
             .iter()
             .for_each(|x| x.store(true, Ordering::Relaxed));
 
+        /*
         for update in 0.. {
             update_pl.start(format!("Starting update {}...", update));
 
@@ -270,6 +272,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
                 break;
             }
         }
+        */
 
         iter_pl.done();
 
@@ -295,11 +298,16 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
         costs.push(cost);
 
         // storing the perms
+        let path = labels_path(gamma_index);
+        info!("Creating {}", path.display());
         let mut file =
-            std::fs::File::create(labels_path(gamma_index)).context("Could not write labels")?;
-        labels
-            .serialize(&mut file)
-            .context("Could not serialize labels")?;
+            std::fs::File::create(&path).context("Could not write labels")?;
+        info!("Writing {}", path.display());
+        let res = labels
+            .serialize(&mut file);
+        info!("Res {:?}", res);
+        res.context("Could not serialize labels")?;
+        info!("Done writing {}", path.display());
 
         gamma_pl.update_and_display();
     }
diff --git a/src/cli/llp.rs b/src/cli/llp.rs
index 99c0151..d77a809 100644
--- a/src/cli/llp.rs
+++ b/src/cli/llp.rs
@@ -85,7 +85,7 @@ pub fn cli(command: Command) -> Command {
 pub fn main(submatches: &ArgMatches) -> Result<()> {
     let args = CliArgs::from_arg_matches(submatches)?;
 
-    match get_endianness(&args.basename)?.as_str() {
+    let main_res = match get_endianness(&args.basename)?.as_str() {
         #[cfg(any(
             feature = "be_bins",
             not(any(feature = "be_bins", feature = "le_bins"))
@@ -97,7 +97,10 @@ pub fn main(submatches: &ArgMatches) -> Result<()> {
         ))]
         LE::NAME => llp_impl::<LE>(args),
         e => panic!("Unknown endianness: {}", e),
-    }
+    };
+
+    log::info!("main res {:?}", main_res);
+    main_res
 }
 
 fn llp_impl<E: Endianness + 'static + Send + Sync>(args: CliArgs) -> Result<()>
@@ -157,7 +160,7 @@ where
     }
 
     // compute the LLP
-    let labels = llp::layered_label_propagation(
+    let res2 = llp::layered_label_propagation(
         &graph,
         &*deg_cumul,
         gammas,
@@ -166,8 +169,10 @@ where
         args.granularity,
         args.seed,
         predicate,
-    )
-    .context("Could not compute the LLP")?;
+    );
+    log::info!("res2 {:?}", res2);
+    let labels = res2.context("Could not compute the LLP")?;
+    log::info!("labels ok");
 
     let mut llp_perm = (0..graph.num_nodes()).collect::<Vec<_>>();
     llp_perm.par_sort_by(|&a, &b| labels[a].cmp(&labels[b]));

llp prints:

[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Log-gap cost: 68596432338
[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Creating /tmp/labels_0.bin
[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Writing /tmp/labels_0.bin
[2024-03-23T11:33:48Z INFO  webgraph::algo::llp] Res Err(WriteError)
Segmentation fault

and here is the traceback:

Thread 1 "webgraph" received signal SIGSEGV, Segmentation fault.                                                                        
__GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102                                                                                  
3102    malloc.c: No such file or directory.                                                                                            
(gdb) bt                                                                                                                                
#0  __GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102                                                                              
#1  0x00005555556459fa in alloc::alloc::dealloc (ptr=<optimized out>, layout=...) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:117                                                                                                              
#2  alloc::alloc::{impl#1}::deallocate (ptr=..., layout=..., self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:254                                                                                                             
#3  alloc::boxed::{impl#8}::drop<[core::sync::atomic::AtomicUsize], alloc::alloc::Global> (self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/boxed.rs:1243                                                                              
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
#5  core::ptr::drop_in_place<webgraph::algo::llp::label_store::LabelStore> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                                                 
#6  webgraph::algo::llp::layered_label_propagation<webgraph::graphs::bvgraph::random_access::BVGraph<webgraph::graphs::bvgraph::codecs::dec_dyn::DynCodesDecoderFactory<dsi_bitstream::traits::endianness::BigEndian, webgraph::graphs::bvgraph::codecs::factories::MemoryFactory
<dsi_bitstream::traits::endianness::BigEndian, webgraph::utils::mmap_helper::MmapHelper<u32, mmap_rs::mmap::Mmap>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_fixed2::SelectFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_ve
c::BitFieldVec<usize, &[usize]>>>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_zero_fixed2::SelectZeroFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_vec::BitFieldVec<usize, &[usize]>>, predicates::boxed::BoxPredicate<webgr
aph::algo::llp::preds::PredParams>> (sym_graph=<optimized out>, deg_cumul=<optimized out>, gammas=..., num_threads=..., chunk_size=..., granularity=..., seed=0, predicate=...) at src/algo/llp/mod.rs:357                                                                       
#7  0x00005555556c6b4d in webgraph::cli::llp::llp_impl<dsi_bitstream::traits::endianness::BigEndian> (args=...) at src/cli/llp.rs:163   
#8  webgraph::cli::llp::main (submatches=<optimized out>) at src/cli/llp.rs:93                                                          
#9  0x00005555555e222c in webgraph::main () at src/main.rs:70                                                                           
(gdb) f 4                                                                                                                               
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
507     /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs: No such file or directory.                         
(gdb) f                                                                                                                                 
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
507     in /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs                                                  

I assume this is due to the transmuted label_store.labels, but I don't see why the compiler would drop the transmuted value before the original, let alone drop it at all. Wrapping it in ManuallyDrop doesn't help.

This happens in both release and debug mode (I commented out the worker loop so that it terminates within a reasonable time in debug mode).

progval avatar Mar 23 '24 11:03 progval

Does it happen on nightly?

vigna avatar Mar 23 '24 13:03 vigna

I didn't try, this is on 1.76.0

progval avatar Mar 23 '24 13:03 progval

Hmm, I can't try on nightly because of a different issue (a stack overflow despite setting ulimit -s 65533, which was enough on stable).

progval avatar Mar 23 '24 13:03 progval

Hah, I forgot RUST_MIN_STACK=8388608. Yes, the segfault still happens on nightly.

progval avatar Mar 23 '24 13:03 progval

Well, we need to get something much smaller and reproducible. But I have the gut feeling this is not gonna be easy.

vigna avatar Apr 11 '24 17:04 vigna