// ED_LRR/rust/src/mmap_csv.rs (70 lines, 2.5 KiB, Rust)

use crate::common::{EdLrrError, EdLrrResult, System};
use crate::info;
use csv_core::{ReadFieldResult, Reader};
use memmap::Mmap;
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
/// Finds, for every name in `query`, the CSV record whose second column is
/// closest to it by Damerau-Levenshtein distance over raw bytes.
///
/// The file at `path` is memory-mapped and scanned once, field by field, using
/// the allocation-free `csv_core` reader. Only the field at index 1 of each
/// record is compared (presumably the system name column; confirm against the
/// file layout). Record indices are zero-based and count every record in the
/// file, including a header row if one is present.
///
/// # Returns
///
/// A map from each query string to `Some(record_index)` of its closest match,
/// or `None` when the file contained no record with a second column to compare
/// against. On ties the earliest record wins. Duplicate query strings collapse
/// into a single map entry.
///
/// # Errors
///
/// Returns the stringified I/O error if the file cannot be opened or mapped,
/// and an error message if any field is larger than the 1024 byte field buffer.
pub fn mmap_csv(path: &Path, query: Vec<String>) -> Result<HashMap<String, Option<u32>>, String> {
    let file = File::open(path).map_err(|e| e.to_string())?;
    // SAFETY: the mapping is only read through a shared slice inside this
    // function. This assumes the file is not truncated or modified by another
    // process while it is mapped.
    let mm = unsafe { Mmap::map(&file) }.map_err(|e| e.to_string())?;
    // One `(best distance, record index)` slot per query, parallel to `query`.
    // A distance of `usize::MAX` marks "nothing compared yet".
    let mut best: Vec<(usize, u32)> = vec![(usize::MAX, u32::MAX); query.len()];
    let t_start = std::time::Instant::now();
    let dist = eddie::slice::DamerauLevenshtein::new();
    let mut row: u32 = 0;
    {
        let mut data = &mm[..];
        let mut rdr = Reader::new();
        let mut field = [0u8; 1024];
        // Number of bytes of the current field already held in `field` from a
        // previous call that ran out of input mid-field.
        let mut filled = 0;
        let mut fieldidx = 0;
        loop {
            let (result, nread, nwrite) = rdr.read_field(data, &mut field[filled..]);
            data = &data[nread..];
            let field_end = filled + nwrite;
            match result {
                // The input ended in the middle of a field (a file without a
                // trailing newline). Keep what was written so the field
                // reported by the next call is complete instead of empty.
                ReadFieldResult::InputEmpty => {
                    filled = field_end;
                }
                ReadFieldResult::OutputFull => {
                    return Err("Encountered field larget than 1024 bytes!".to_string());
                }
                ReadFieldResult::Field { record_end } => {
                    if fieldidx == 1 {
                        let value = &field[..field_end];
                        for (name, (best_dist, id)) in query.iter().zip(best.iter_mut()) {
                            let d = dist.distance(name.as_bytes(), value);
                            // Strict comparison keeps the earliest record on ties.
                            if d < *best_dist {
                                *best_dist = d;
                                *id = row;
                            }
                        }
                    }
                    filled = 0;
                    if record_end {
                        fieldidx = 0;
                        row += 1;
                    } else {
                        fieldidx += 1;
                    }
                }
                // This case happens when the CSV reader has successfully exhausted
                // all input.
                ReadFieldResult::End => {
                    break;
                }
            }
        }
    }
    // `query` is owned, so the names are moved into the map rather than cloned.
    let search_result = query
        .into_iter()
        .zip(best)
        .map(|(name, (best_dist, idx))| {
            // An untouched distance means no candidate was ever seen; report
            // that as `None` instead of leaking the `u32::MAX` placeholder.
            let hit = if best_dist == usize::MAX {
                None
            } else {
                Some(idx)
            };
            (name, hit)
        })
        .collect::<HashMap<String, Option<u32>>>();
    let rate = (row as f64) / t_start.elapsed().as_secs_f64();
    info!(
        "Took: {:.2?}, {:.2} systems/second",
        t_start.elapsed(),
        rate
    );
    Ok(search_result)
}