From adcdaa20c6a7c112ca1912f9ee1b9d1157e88b65 Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 10 Feb 2026 21:03:22 +0000 Subject: [PATCH 1/3] feat: add corruption prevention and integrity checking - Add read-time validation of commit objects in store.rs (verify delta IDs, commit IDs, parent references) - Enable zstd frame checksums on write - Add directory fsync after atomic renames for durability - Validate delta chain consistency in materialize_committed_tree - Reject unsupported Patch delta variants in apply_delta - Defer ref updates in graft to avoid dangling references - Add 'arc check' command (fsck) with orphan detection - Add CorruptObject and UnsupportedDelta error variants --- src/check.rs | 194 ++++++++++++++++++++++++++++++++++++++++++++++++ src/cli.rs | 24 +++++- src/error.rs | 4 + src/main.rs | 1 + src/modify.rs | 59 ++++++++++++++- src/store.rs | 54 +++++++++++++- src/tracking.rs | 25 +++++-- tests/check.rs | 154 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 504 insertions(+), 11 deletions(-) create mode 100644 src/check.rs create mode 100644 tests/check.rs diff --git a/src/check.rs b/src/check.rs new file mode 100644 index 0000000..6eb4d23 --- /dev/null +++ b/src/check.rs @@ -0,0 +1,194 @@ +use std::collections::HashSet; +use std::fs; + +use crate::error::Result; +use crate::model::{CommitId, RefTarget}; +use crate::repo::Repository; +use crate::store; +use crate::tracking; +use crate::ui; + +pub struct CheckReport { + pub commits_checked: usize, + pub refs_checked: usize, + pub errors: Vec, +} + +impl CheckReport { + pub fn is_ok(&self) -> bool { + self.errors.is_empty() + } +} + +impl std::fmt::Display for CheckReport { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_ok() { + writeln!( + f, + "{}", + ui::success(&format!( + "repository ok: {} commit(s), {} ref(s) checked", + self.commits_checked, self.refs_checked + )) + ) + } else { + for err in &self.errors { + writeln!(f, "{}", ui::error(err))?; + } + writeln!( + f, + "\n{} error(s) found in {} commit(s), {} ref(s)", + self.errors.len(), + self.commits_checked, + self.refs_checked + ) + } + } +} + +pub fn check(repo: &Repository) -> Result { + debug!(1, "running repository integrity check"); + let mut errors = Vec::new(); + let mut visited = HashSet::new(); + let mut refs_checked = 0usize; + + let bookmark_ids = collect_ref_targets(repo, &repo.bookmarks_dir(), &mut errors); + refs_checked += bookmark_ids.len(); + let tag_ids = collect_ref_targets(repo, &repo.tags_dir(), &mut errors); + refs_checked += tag_ids.len(); + + let head = match repo.load_head() { + Ok(h) => Some(h), + Err(e) => { + errors.push(format!("failed to load HEAD: {e}")); + None + } + }; + + let head_commit = match &head { + Some(crate::model::Head::Attached { commit, .. }) => Some(commit.clone()), + Some(crate::model::Head::Detached { commit }) => Some(commit.clone()), + _ => None, + }; + + let mut all_roots: Vec = Vec::new(); + if let Some(id) = head_commit { + all_roots.push(id); + } + all_roots.extend(bookmark_ids); + all_roots.extend(tag_ids); + + for root in &all_roots { + walk_commits(repo, root, &mut visited, &mut errors); + } + + if let Some(tip) = all_roots.first() { + debug!(2, "verifying delta chain replay from HEAD"); + if let Err(e) = tracking::materialize_committed_tree(repo, tip) { + errors.push(format!("delta chain replay failed: {e}")); + } + } + + let commits_checked = visited.len(); + + let orphans = find_orphan_files(repo, &visited); + for orphan in &orphans { + errors.push(format!("orphan commit object: {orphan}")); + } + + debug!( + 1, + "check complete: {} commit(s), {} ref(s), {} error(s)", + commits_checked, + refs_checked, + errors.len() + ); + + Ok(CheckReport { + commits_checked, + refs_checked, + errors, + }) +} + +fn collect_ref_targets( + repo: &Repository, + dir: &std::path::Path, + errors: &mut Vec, +) -> Vec { + let mut ids = Vec::new(); + let entries = match fs::read_dir(dir) { + Ok(e) => e, + Err(_) => return ids, + }; + + for entry in entries.flatten() { + if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) { + continue; + } + let name = entry.file_name().to_string_lossy().to_string(); + let path = entry.path(); + match fs::read_to_string(&path) { + Ok(contents) => match serde_yaml::from_str::(&contents) { + Ok(ref_target) => { + if let Some(id) = ref_target.commit { + if !store::commit_object_path(repo, &id).exists() { + errors.push(format!("ref '{}' points to missing commit {}", name, id)); + } + ids.push(id); + } + } + Err(e) => errors.push(format!("ref '{}' has invalid format: {}", name, e)), + }, + Err(e) => errors.push(format!("cannot read ref '{}': {}", name, e)), + } + } + + ids +} + +fn walk_commits( + repo: &Repository, + start: &CommitId, + visited: &mut HashSet, + errors: &mut Vec, +) { + let mut queue = vec![start.clone()]; + + while let Some(id) = queue.pop() { + if !visited.insert(id.0.clone()) { + continue; + } + + match store::read_commit_object(repo, &id) { + Ok(obj) => { + for parent in &obj.commit.parents { + queue.push(parent.clone()); + } + } + Err(e) => { + errors.push(format!("commit {}: {}", &id.0[..id.0.len().min(12)], e)); + } + } + } +} + +fn find_orphan_files(repo: &Repository, reachable: &HashSet) -> Vec { + let dir = repo.commits_dir(); + let entries = match fs::read_dir(&dir) { + Ok(e) => e, + Err(_) => return Vec::new(), + }; + + let mut orphans = Vec::new(); + for entry in entries.flatten() { + let name = entry.file_name().to_string_lossy().to_string(); + if let Some(id) = name.strip_suffix(".zst") + && !reachable.contains(id) + { + orphans.push(id.to_string()); + } + } + orphans.sort(); + orphans +} diff --git a/src/cli.rs b/src/cli.rs index ab84e52..a48f52c 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -5,6 +5,7 @@ use std::sync::atomic::{AtomicU8, Ordering}; use clap::{ArgAction, Parser, Subcommand}; use crate::bridge; +use crate::check; use crate::config; use crate::diff; use crate::ignore::IgnoreRules; @@ -134,6 +135,9 @@ pub enum Command { /// Convert a git repo to an arc repo Migrate, + /// Verify repository integrity + Check, + /// Manage bookmarks Mark { #[command(subcommand)] @@ -580,6 +584,22 @@ pub fn dispatch(cli: Cli) { } } } + Command::Check => { + debug!(1, "command: check"); + let repo = open_repo_or_exit(); + match check::check(&repo) { + Ok(report) => { + print!("{report}"); + if !report.errors.is_empty() { + std::process::exit(1); + } + } + Err(e) => { + eprintln!("{}", ui::error(&e.to_string())); + std::process::exit(1); + } + } + } Command::Mark { command } => { debug!(1, "command: mark"); let repo = open_repo_or_exit(); @@ -882,11 +902,11 @@ fn run_diff(repo: &Repository, range: Option<&str>) -> crate::error::Result write!(f, "not a git repository"), Self::FastForwardOnly(reason) => write!(f, "cannot fast-forward: {reason}"), Self::SigningError(msg) => write!(f, "signing error: {msg}"), + Self::CorruptObject(msg) => write!(f, "corrupt object: {msg}"), + Self::UnsupportedDelta(msg) => write!(f, "unsupported delta format: {msg}"), } } } diff --git a/src/main.rs b/src/main.rs index 3ea7e1a..292b30b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ pub mod ui; pub mod bridge; +pub mod check; mod cli; pub mod config; pub mod diff; diff --git a/src/modify.rs b/src/modify.rs index 1c95059..e7a926e 100644 --- a/src/modify.rs +++ b/src/modify.rs @@ -195,7 +195,7 @@ pub fn graft(repo: &Repository, target: &str, onto: &str) -> Result, + new_tree: &FileTree, +) -> Result { + let parent_tree = if parents.is_empty() { + BTreeMap::new() + } else { + tracking::materialize_committed_tree(repo, &parents[0])? + }; + + let changes = tracking::detect_changes(&parent_tree, new_tree); + + if changes.is_empty() { + return Err(ArcError::NothingToCommit); + } + + let delta_id = store::compute_delta_id(&parents.first().cloned(), &changes)?; + let delta = Delta { + id: delta_id.clone(), + base: parents.first().cloned(), + changes, + }; + + let config = crate::config::load_effective(repo); + let author = match (config.user_name, config.user_email) { + (Some(name), Some(email)) => Some(crate::model::Signature { name, email }), + _ => None, + }; + + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_err(|_| ArcError::ClockError)? + .as_secs() as i64; + + let commit_id = store::compute_commit_id(&parents, &delta_id, message, &author, timestamp)?; + + let commit_obj = crate::model::Commit { + id: commit_id.clone(), + parents: parents.clone(), + delta: delta_id, + message: message.to_string(), + author, + timestamp, + ssh_signature: None, + }; + + let obj = CommitObject { + commit: commit_obj, + delta, + }; + store::write_commit_object(repo, &obj)?; + + Ok(commit_id) +} + fn commit_tree_internal( repo: &Repository, message: &str, diff --git a/src/store.rs b/src/store.rs index 956d303..1d28097 100644 --- a/src/store.rs +++ b/src/store.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use crate::error::Result; +use crate::error::{ArcError, Result}; use crate::model::{Commit, CommitId, Delta, DeltaId, FileChange, Signature}; use crate::repo::Repository; @@ -22,8 +22,10 @@ pub fn commit_object_path(repo: &Repository, id: &CommitId) -> PathBuf { pub fn write_commit_object(repo: &Repository, obj: &CommitObject) -> Result<()> { debug!(3, "writing commit object {}", obj.commit.id.0); let msgpack = rmp_serde::to_vec(obj)?; - let compressed = - zstd::stream::encode_all(Cursor::new(&msgpack), 3).map_err(std::io::Error::other)?; + let mut encoder = zstd::Encoder::new(Vec::new(), 3).map_err(std::io::Error::other)?; + encoder.include_checksum(true).map_err(std::io::Error::other)?; + encoder.write_all(&msgpack)?; + let compressed = encoder.finish().map_err(std::io::Error::other)?; let path = commit_object_path(repo, &obj.commit.id); let tmp_path = path.with_extension("zst.tmp"); @@ -31,6 +33,8 @@ pub fn write_commit_object(repo: &Repository, obj: &CommitObject) -> Result<()> f.write_all(&compressed)?; f.sync_all()?; fs::rename(&tmp_path, &path)?; + let parent_dir = std::fs::File::open(repo.commits_dir())?; + parent_dir.sync_all()?; Ok(()) } @@ -45,9 +49,53 @@ pub fn read_commit_object(repo: &Repository, id: &CommitId) -> Result Result<()> { + let expected_delta_id = compute_delta_id(&obj.delta.base, &obj.delta.changes)?; + if expected_delta_id != obj.delta.id { + return Err(ArcError::CorruptObject(format!( + "delta id mismatch: expected {}, found {}", + expected_delta_id, obj.delta.id + ))); + } + if obj.commit.delta != obj.delta.id { + return Err(ArcError::CorruptObject(format!( + "commit references delta {}, but object contains delta {}", + obj.commit.delta, obj.delta.id + ))); + } + let expected_commit_id = compute_commit_id( + &obj.commit.parents, + &obj.delta.id, + &obj.commit.message, + &obj.commit.author, + obj.commit.timestamp, + )?; + if expected_commit_id != obj.commit.id { + return Err(ArcError::CorruptObject(format!( + "commit id mismatch: expected {}, found {}", + expected_commit_id, obj.commit.id + ))); + } + if obj.commit.id != *id { + return Err(ArcError::CorruptObject(format!( + "commit id does not match expected id: expected {}, found {}", + id, obj.commit.id + ))); + } + if obj.delta.base != obj.commit.parents.first().cloned() { + return Err(ArcError::CorruptObject(format!( + "delta base {:?} does not match first parent {:?}", + obj.delta.base, + obj.commit.parents.first() + ))); + } + Ok(()) +} + fn sha256_hex(bytes: &[u8]) -> String { let mut hasher = Sha256::new(); hasher.update(bytes); diff --git a/src/tracking.rs b/src/tracking.rs index 13fc323..ed11544 100644 --- a/src/tracking.rs +++ b/src/tracking.rs @@ -66,8 +66,16 @@ pub fn materialize_committed_tree(repo: &Repository, head: &CommitId) -> Result< debug!(3, "materializing tree at commit {}", head.0); let history = load_linear_history(repo, head)?; let mut tree = BTreeMap::new(); + let mut expected_base: Option = None; for obj in &history { - apply_delta(&mut tree, &obj.delta); + if obj.delta.base != expected_base { + return Err(crate::error::ArcError::CorruptObject(format!( + "delta chain broken at commit {}", + obj.commit.id.0 + ))); + } + apply_delta(&mut tree, &obj.delta)?; + expected_base = Some(obj.commit.id.clone()); } debug!(3, "materialized tree with {} file(s)", tree.len()); Ok(tree) @@ -93,14 +101,20 @@ pub fn load_linear_history(repo: &Repository, head: &CommitId) -> Result crate::error::Result<()> { for change in &delta.changes { match &change.kind { - FileChangeKind::Add { content } | FileChangeKind::Modify { content } => { - if let FileContentDelta::Full { bytes } = content { + FileChangeKind::Add { content } | FileChangeKind::Modify { content } => match content { + FileContentDelta::Full { bytes } => { tree.insert(change.path.clone(), bytes.clone()); } - } + FileContentDelta::Patch { format, .. } => { + return Err(crate::error::ArcError::UnsupportedDelta(format!( + "patch format '{}' on file '{}'", + format, change.path + ))); + } + }, FileChangeKind::Delete => { tree.remove(&change.path); } @@ -111,6 +125,7 @@ pub fn apply_delta(tree: &mut FileTree, delta: &Delta) { } } } + Ok(()) } pub fn detect_changes(committed: &FileTree, worktree: &FileTree) -> Vec { diff --git a/tests/check.rs b/tests/check.rs new file mode 100644 index 0000000..231a554 --- /dev/null +++ b/tests/check.rs @@ -0,0 +1,154 @@ +use std::process::Command; +use tempfile::TempDir; + +fn arc_cmd() -> Command { + let mut cmd = Command::new(env!("CARGO_BIN_EXE_arc")); + cmd.env("NO_COLOR", "1"); + cmd +} + +fn init_repo() -> TempDir { + let dir = TempDir::new().unwrap(); + arc_cmd() + .arg("init") + .current_dir(dir.path()) + .output() + .expect("failed to init"); + dir +} + +fn commit_file(dir: &TempDir, name: &str, content: &str, msg: &str) { + std::fs::write(dir.path().join(name), content).unwrap(); + let output = arc_cmd() + .args(["commit", msg]) + .current_dir(dir.path()) + .output() + .expect("failed to commit"); + assert!(output.status.success()); +} + +#[test] +fn check_clean_repo_succeeds() { + let dir = init_repo(); + commit_file(&dir, "a.txt", "hello\n", "initial"); + + let output = arc_cmd() + .arg("check") + .current_dir(dir.path()) + .output() + .expect("failed to run check"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("repository ok")); +} + +#[test] +fn check_multi_commit_repo() { + let dir = init_repo(); + commit_file(&dir, "a.txt", "hello\n", "first"); + commit_file(&dir, "b.txt", "world\n", "second"); + commit_file(&dir, "a.txt", "updated\n", "third"); + + let output = arc_cmd() + .arg("check") + .current_dir(dir.path()) + .output() + .expect("failed to run check"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("3 commit(s)")); +} + +#[test] +fn check_detects_corrupt_commit_file() { + let dir = init_repo(); + commit_file(&dir, "a.txt", "hello\n", "initial"); + + let commits_dir = dir.path().join(".arc").join("commits"); + let entries: Vec<_> = std::fs::read_dir(&commits_dir) + .unwrap() + .flatten() + .collect(); + assert_eq!(entries.len(), 1); + + let commit_path = entries[0].path(); + std::fs::write(&commit_path, b"corrupted data").unwrap(); + + let output = arc_cmd() + .arg("check") + .current_dir(dir.path()) + .output() + .expect("failed to run check"); + + assert!(!output.status.success()); +} + +#[test] +fn check_detects_missing_commit_from_ref() { + let dir = init_repo(); + commit_file(&dir, "a.txt", "hello\n", "initial"); + + let commits_dir = dir.path().join(".arc").join("commits"); + let entries: Vec<_> = std::fs::read_dir(&commits_dir) + .unwrap() + .flatten() + .collect(); + for entry in entries { + std::fs::remove_file(entry.path()).unwrap(); + } + + let output = arc_cmd() + .arg("check") + .current_dir(dir.path()) + .output() + .expect("failed to run check"); + + assert!(!output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("missing commit") || stdout.contains("error")); +} + +#[test] +fn check_with_bookmarks_and_tags() { + let dir = init_repo(); + commit_file(&dir, "a.txt", "hello\n", "initial"); + + arc_cmd() + .args(["mark", "add", "feature"]) + .current_dir(dir.path()) + .output() + .expect("failed"); + + arc_cmd() + .args(["tag", "add", "v1"]) + .current_dir(dir.path()) + .output() + .expect("failed"); + + let output = arc_cmd() + .arg("check") + .current_dir(dir.path()) + .output() + .expect("failed to run check"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("repository ok")); +} + +#[test] +fn check_empty_repo() { + let dir = init_repo(); + + let output = arc_cmd() + .arg("check") + .current_dir(dir.path()) + .output() + .expect("failed to run check"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("repository ok")); +} From a6b1a027c8f0a78d774c9d0575a844627567828a Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 10 Feb 2026 21:03:53 +0000 Subject: [PATCH 2/3] refactor: switch storage format from MessagePack to bincode - Replace rmp-serde with bincode 1.x in Cargo.toml - Update store.rs serialization/deserialization and ID hashing - Rename model.rs helpers from to_msgpack/from_msgpack to to_bytes/from_bytes - Consolidate MsgPack/MsgPackDecode error variants into single Bincode variant - Remove skip_serializing_if on ssh_signature (incompatible with bincode) - Update all documentation to reflect bincode storage format --- Cargo.lock | 30 ++++++++++-------------------- Cargo.toml | 2 +- README.md | 4 ++-- docs/architecture.md | 6 +++--- docs/commands.md | 2 +- docs/git-bridge.md | 2 +- docs/spec.md | 4 ++-- src/error.rs | 18 +++++------------- src/model.rs | 17 ++++++++--------- src/store.rs | 14 +++++++------- 10 files changed, 40 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6049c1b..77d04aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -56,11 +56,11 @@ dependencies = [ name = "arc" version = "0.1.0" dependencies = [ + "bincode", "clap", "colored", "git2", "hex", - "rmp-serde", "serde", "serde_yaml", "sha2", @@ -87,6 +87,15 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "2.10.0" @@ -954,25 +963,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "rmp" -version = "0.8.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c" -dependencies = [ - "num-traits", -] - -[[package]] -name = "rmp-serde" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72f81bee8c8ef9b577d1681a70ebbc962c232461e397b22c208c43c04b67a155" -dependencies = [ - "rmp", - "serde", -] - [[package]] name = "rsa" version = "0.9.10" diff --git a/Cargo.toml b/Cargo.toml index 1d3991e..56b6556 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ edition = "2024" clap = { version = "4", features = ["derive"] } serde = { version = "1", features = ["derive"] } serde_yaml = "0.9" -rmp-serde = "1" +bincode = "1" zstd = "0.13" sha2 = "0.10" hex = "0.4" diff --git a/README.md b/README.md index 9ff9501..d217eb0 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ A delta-based version control system written in Rust. Unlike Git's snapshot-based model, Arc stores incremental deltas using -ZSTD-compressed MessagePack files. Changes are automatically tracked +ZSTD-compressed bincode files. Changes are automatically tracked without manual staging, and commits are immutable once created. Arc uses a **bookmark** system instead of branches, and bridges to Git @@ -14,7 +14,7 @@ remotes for push, pull, clone, and sync operations via `libgit2`. ## Features -- Incremental delta storage (ZSTD + MessagePack) +- Incremental delta storage (ZSTD + bincode) - Automatic change tracking (no staging step) - Bookmarks and immutable tags - Named stashes diff --git a/docs/architecture.md b/docs/architecture.md index 25c7ce1..558556d 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -11,7 +11,7 @@ An arc repository keeps all state in an `.arc/` directory at the worktree root: |------|--------|---------| | `HEAD` | YAML | Current state — one of three variants: **unborn** (no commits yet; has `bookmark`), **attached** (on a bookmark; has `bookmark` + `commit`), or **detached** (raw commit; has `commit`). | | `config.yml` | YAML | Local repository configuration. | -| `commits/.zst` | Zstandard-compressed MessagePack | Commit objects. Each file contains a `CommitObject` that bundles a `Commit` and its `Delta`. | +| `commits/.zst` | Zstandard-compressed bincode | Commit objects. Each file contains a `CommitObject` that bundles a `Commit` and its `Delta`. | | `bookmarks/.yml` | YAML | One file per bookmark. Contains a `RefTarget` with an optional `commit` field. | | `tags/.yml` | YAML | Same format as bookmarks. | | `stashes/state.yml` | YAML | Tracks the active stash. | @@ -42,9 +42,9 @@ hex hashes. ## Storage (`src/store.rs`) `CommitObject` bundles a `Commit` and its `Delta` into a single unit that is -serialized as MessagePack, then compressed with Zstandard at level 3. Files are +serialized with bincode, then compressed with Zstandard at level 3. Files are written atomically (write to `.tmp`, then rename). IDs are computed by SHA-256 -hashing the MessagePack-serialized content-addressable data. +hashing the bincode-serialized content-addressable data. ## Tracking (`src/tracking.rs`) diff --git a/docs/commands.md b/docs/commands.md index c9df7a5..a0dd597 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -14,7 +14,7 @@ Initialize a new arc repository. Creates the `.arc/` directory structure includi ### `arc commit ` -Commit all current changes. No staging area is needed — changes are detected automatically by comparing the worktree to the last commit. Creates a ZSTD-compressed MessagePack commit object in `.arc/commits/`. If a signing key is configured (`user.key`), the commit is signed with SSH. +Commit all current changes. No staging area is needed — changes are detected automatically by comparing the worktree to the last commit. Creates a ZSTD-compressed bincode commit object in `.arc/commits/`. If a signing key is configured (`user.key`), the commit is signed with SSH. ### `arc status` diff --git a/docs/git-bridge.md b/docs/git-bridge.md index cd205fa..155e92a 100644 --- a/docs/git-bridge.md +++ b/docs/git-bridge.md @@ -1,6 +1,6 @@ # Git Bridge -Arc uses an internal git bridge to interoperate with git remotes. Since Arc uses its own delta-based storage format (ZSTD-compressed MessagePack), it maintains a shadow bare git repository to translate between formats when communicating with git servers. +Arc uses an internal git bridge to interoperate with git remotes. Since Arc uses its own delta-based storage format (ZSTD-compressed bincode), it maintains a shadow bare git repository to translate between formats when communicating with git servers. ## Shadow Repository diff --git a/docs/spec.md b/docs/spec.md index c59ef8f..129f392 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -20,7 +20,7 @@ This is an overview of the foundational rules that make the software. 8a. use `feat: ` for new features, `fix: ` for bug fixes, `refactor: ` for changes. 8b. use `docs: ` for docs changes, `build: ` for build system changes, etc. 9. Anything involving remotes should use `libgit` or `git2` libraries for compatibility. -10. Deltas should be stored using ZSTD compressed Messagepack files for easy storage. +10. Deltas should be stored using ZSTD compressed bincode files for easy storage. 11. When pushing, pulling, and fetching from remotes, it should be bridged to git. 12. Lastly, it should cover 90% of use cases that git has, for full feature support. 13. Arc should support **optional** commit signing via SSH keys. @@ -117,7 +117,7 @@ These are the implementation phases that should be implemented incrementally. 1. **Project scaffolding** - Nix flake, direnv, Rust project structure, CLI skeleton with clap, help 2. **Core repo structure** - init, internal data model (commits, deltas, YAML config), .arcignore -3. **Tracking & committing** - commit, status, diff, auto-change detection, ZSTD + MessagePack storage +3. **Tracking & committing** - commit, status, diff, auto-change detection, ZSTD + bincode storage 4. **History & inspection** - log, show, history, state reconstruction from delta chains 5. **Bookmarks & tags** - mark commands, tag commands, and switch command 6. **Undo & modification** - revert, reset, graft, three-way merge diff --git a/src/error.rs b/src/error.rs index 67ad30b..4728e12 100644 --- a/src/error.rs +++ b/src/error.rs @@ -5,8 +5,7 @@ use std::io; pub enum ArcError { Io(io::Error), Yaml(serde_yaml::Error), - MsgPack(rmp_serde::encode::Error), - MsgPackDecode(rmp_serde::decode::Error), + Bincode(Box), RepoNotFound, RepoAlreadyExists, InvalidPath(String), @@ -50,8 +49,7 @@ impl fmt::Display for ArcError { match self { Self::Io(e) => write!(f, "io error: {e}"), Self::Yaml(e) => write!(f, "yaml error: {e}"), - Self::MsgPack(e) => write!(f, "msgpack encode error: {e}"), - Self::MsgPackDecode(e) => write!(f, "msgpack decode error: {e}"), + Self::Bincode(e) => write!(f, "bincode error: {e}"), Self::RepoNotFound => write!(f, "not an arc repository (or any parent)"), Self::RepoAlreadyExists => { write!(f, "arc repository already exists in this directory") @@ -119,15 +117,9 @@ impl From for ArcError { } } -impl From for ArcError { - fn from(e: rmp_serde::encode::Error) -> Self { - Self::MsgPack(e) - } -} - -impl From for ArcError { - fn from(e: rmp_serde::decode::Error) -> Self { - Self::MsgPackDecode(e) +impl From> for ArcError { + fn from(e: Box) -> Self { + Self::Bincode(e) } } diff --git a/src/model.rs b/src/model.rs index 3cf6f72..7d25021 100644 --- a/src/model.rs +++ b/src/model.rs @@ -40,7 +40,6 @@ pub struct Commit { pub message: String, pub author: Option, pub timestamp: i64, - #[serde(skip_serializing_if = "Option::is_none", default)] pub ssh_signature: Option, } @@ -94,21 +93,21 @@ pub struct RefTarget { } impl Commit { - pub fn to_msgpack(&self) -> crate::error::Result> { - Ok(rmp_serde::to_vec(self)?) + pub fn to_bytes(&self) -> crate::error::Result> { + Ok(bincode::serialize(self)?) } - pub fn from_msgpack(bytes: &[u8]) -> crate::error::Result { - Ok(rmp_serde::from_slice(bytes)?) + pub fn from_bytes(bytes: &[u8]) -> crate::error::Result { + Ok(bincode::deserialize(bytes)?) } } impl Delta { - pub fn to_msgpack(&self) -> crate::error::Result> { - Ok(rmp_serde::to_vec(self)?) + pub fn to_bytes(&self) -> crate::error::Result> { + Ok(bincode::serialize(self)?) } - pub fn from_msgpack(bytes: &[u8]) -> crate::error::Result { - Ok(rmp_serde::from_slice(bytes)?) + pub fn from_bytes(bytes: &[u8]) -> crate::error::Result { + Ok(bincode::deserialize(bytes)?) } } diff --git a/src/store.rs b/src/store.rs index 1d28097..05112a8 100644 --- a/src/store.rs +++ b/src/store.rs @@ -21,10 +21,10 @@ pub fn commit_object_path(repo: &Repository, id: &CommitId) -> PathBuf { pub fn write_commit_object(repo: &Repository, obj: &CommitObject) -> Result<()> { debug!(3, "writing commit object {}", obj.commit.id.0); - let msgpack = rmp_serde::to_vec(obj)?; + let encoded = bincode::serialize(obj)?; let mut encoder = zstd::Encoder::new(Vec::new(), 3).map_err(std::io::Error::other)?; encoder.include_checksum(true).map_err(std::io::Error::other)?; - encoder.write_all(&msgpack)?; + encoder.write_all(&encoded)?; let compressed = encoder.finish().map_err(std::io::Error::other)?; let path = commit_object_path(repo, &obj.commit.id); @@ -44,11 +44,11 @@ pub fn read_commit_object(repo: &Repository, id: &CommitId) -> Result { pub fn compute_delta_id(base: &Option, changes: &[FileChange]) -> Result { debug!(3, "computing delta id (base: {:?})", base); let hashable = DeltaForHash { base, changes }; - let bytes = rmp_serde::to_vec(&hashable) + let bytes = bincode::serialize(&hashable) .map_err(|e| crate::error::ArcError::HashError(e.to_string()))?; Ok(DeltaId(sha256_hex(&bytes))) } @@ -140,7 +140,7 @@ pub fn compute_commit_id( author, timestamp, }; - let bytes = rmp_serde::to_vec(&hashable) + let bytes = bincode::serialize(&hashable) .map_err(|e| crate::error::ArcError::HashError(e.to_string()))?; Ok(CommitId(sha256_hex(&bytes))) } From 6f307c139bf55e04f73ac08700a1928921b6bf9c Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 10 Feb 2026 21:09:58 +0000 Subject: [PATCH 3/3] feat: add backwards compatibility for legacy msgpack storage format --- Cargo.lock | 20 +++++++ Cargo.toml | 1 + src/store.rs | 148 +++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 166 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 77d04aa..d418e2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -61,6 +61,7 @@ dependencies = [ "colored", "git2", "hex", + "rmp-serde", "serde", "serde_yaml", "sha2", @@ -963,6 +964,25 @@ dependencies = [ "subtle", ] +[[package]] +name = "rmp" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "rmp-serde" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f81bee8c8ef9b577d1681a70ebbc962c232461e397b22c208c43c04b67a155" +dependencies = [ + "rmp", + "serde", +] + [[package]] name = "rsa" version = "0.9.10" diff --git a/Cargo.toml b/Cargo.toml index 56b6556..c25bcef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ clap = { version = "4", features = ["derive"] } serde = { version = "1", features = ["derive"] } serde_yaml = "0.9" bincode = "1" +rmp-serde = "1" zstd = "0.13" sha2 = "0.10" hex = "0.4" diff --git a/src/store.rs b/src/store.rs index 05112a8..f41dbbf 100644 --- a/src/store.rs +++ b/src/store.rs @@ -15,6 +15,62 @@ pub struct CommitObject { pub delta: Delta, } +mod legacy { + use serde::{Deserialize, Serialize}; + + use crate::model::{CommitId, Delta, DeltaId, FileChange, Signature}; + + #[derive(Deserialize)] + pub struct LegacyCommit { + pub id: CommitId, + pub parents: Vec, + pub delta: DeltaId, + pub message: String, + pub author: Option, + pub timestamp: i64, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub ssh_signature: Option, + } + + #[derive(Deserialize)] + pub struct LegacyCommitObject { + pub commit: LegacyCommit, + pub delta: Delta, + } + + #[derive(Serialize)] + pub struct DeltaForHash<'a> { + pub base: &'a Option, + pub changes: &'a [FileChange], + } + + #[derive(Serialize)] + pub struct CommitForHash<'a> { + pub parents: &'a [CommitId], + pub delta: &'a DeltaId, + pub message: &'a str, + pub author: &'a Option, + pub timestamp: i64, + } + + impl LegacyCommitObject { + pub fn into_commit_object(self) -> super::CommitObject { + super::CommitObject { + commit: crate::model::Commit { + id: self.commit.id, + parents: self.commit.parents, + delta: self.commit.delta, + message: self.commit.message, + author: self.commit.author, + timestamp: self.commit.timestamp, + ssh_signature: self.commit.ssh_signature, + }, + delta: self.delta, + } + } + } +} + pub fn commit_object_path(repo: &Repository, id: &CommitId) -> PathBuf { repo.commits_dir().join(format!("{}.zst", id.0)) } @@ -48,9 +104,26 @@ pub fn read_commit_object(repo: &Repository, id: &CommitId) -> Result(&decoded) { + Ok(obj) => { + validate_commit_object(&obj, id)?; + Ok(obj) + } + Err(bincode_err) => { + debug!(3, "bincode failed, trying legacy msgpack for {}", id.0); + match rmp_serde::from_slice::(&decoded) { + Ok(legacy_obj) => { + let obj = legacy_obj.into_commit_object(); + validate_legacy_commit_object(&obj, id)?; + Ok(obj) + } + Err(msgpack_err) => Err(ArcError::CorruptObject(format!( + "failed to decode object (bincode: {bincode_err}, msgpack: {msgpack_err})" + ))), + } + } + } } fn validate_commit_object(obj: &CommitObject, id: &CommitId) -> Result<()> { @@ -96,6 +169,75 @@ fn validate_commit_object(obj: &CommitObject, id: &CommitId) -> Result<()> { Ok(()) } +fn validate_legacy_commit_object(obj: &CommitObject, id: &CommitId) -> Result<()> { + let expected_delta_id = compute_legacy_delta_id(&obj.delta.base, &obj.delta.changes)?; + if expected_delta_id != obj.delta.id { + return Err(ArcError::CorruptObject(format!( + "delta id mismatch: expected {}, found {}", + expected_delta_id, obj.delta.id + ))); + } + if obj.commit.delta != obj.delta.id { + return Err(ArcError::CorruptObject(format!( + "commit references delta {}, but object contains delta {}", + obj.commit.delta, obj.delta.id + ))); + } + let expected_commit_id = compute_legacy_commit_id( + &obj.commit.parents, + &obj.delta.id, + &obj.commit.message, + &obj.commit.author, + obj.commit.timestamp, + )?; + if expected_commit_id != obj.commit.id { + return Err(ArcError::CorruptObject(format!( + "commit id mismatch: expected {}, found {}", + expected_commit_id, obj.commit.id + ))); + } + if obj.commit.id != *id { + return Err(ArcError::CorruptObject(format!( + "commit id does not match expected id: expected {}, found {}", + id, obj.commit.id + ))); + } + if obj.delta.base != obj.commit.parents.first().cloned() { + return Err(ArcError::CorruptObject(format!( + "delta base {:?} does not match first parent {:?}", + obj.delta.base, + obj.commit.parents.first() + ))); + } + Ok(()) +} + +fn compute_legacy_delta_id(base: &Option, changes: &[FileChange]) -> Result { + let hashable = legacy::DeltaForHash { base, changes }; + let bytes = rmp_serde::to_vec(&hashable) + .map_err(|e| ArcError::HashError(e.to_string()))?; + Ok(DeltaId(sha256_hex(&bytes))) +} + +fn compute_legacy_commit_id( + parents: &[CommitId], + delta: &DeltaId, + message: &str, + author: &Option, + timestamp: i64, +) -> Result { + let hashable = legacy::CommitForHash { + parents, + delta, + message, + author, + timestamp, + }; + let bytes = rmp_serde::to_vec(&hashable) + .map_err(|e| ArcError::HashError(e.to_string()))?; + Ok(CommitId(sha256_hex(&bytes))) +} + fn sha256_hex(bytes: &[u8]) -> String { let mut hasher = Sha256::new(); hasher.update(bytes);