diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 000000000..aa608ef2d --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,143 @@ +name: Build and Publish Python Package + +on: + push: + tags: + - 'v*' + +# Add top-level permissions block +permissions: + id-token: write + contents: read + +jobs: + macos: + runs-on: macos-latest + # Add permissions to job + permissions: + id-token: write + contents: read + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + target: [x86_64, aarch64] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist/*.whl + if-no-files-found: error + + windows: + runs-on: windows-latest + # Add permissions to job + permissions: + id-token: write + contents: read + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + target: [x64] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: ${{ matrix.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + args: --release --out dist + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist/*.whl + if-no-files-found: error + + linux: + runs-on: ubuntu-latest + # Add permissions to job + permissions: + id-token: write + contents: read + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + target: [x86_64] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --user cffi + python -m pip install --user patchelf + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + manylinux: auto + args: --release --out dist + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist/*.whl + if-no-files-found: error + + sdist: + runs-on: ubuntu-latest + # Add permissions to job + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist/*.tar.gz + if-no-files-found: error + + release: + name: Release + runs-on: ubuntu-latest + needs: [macos, windows, linux, sdist] + # Keep existing permissions + permissions: + id-token: write + contents: read + steps: + - uses: actions/download-artifact@v3 + with: + name: wheels + path: dist + - name: Display structure of downloaded files + run: ls -R dist + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ + verbose: true + print-hash: true \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 99d8fec40..49056980b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,11 @@ license = "GPL-3.0" name = "self_encryption" readme = "README.md" repository = "https://github.com/maidsafe/self_encryption" -version = "0.30.0" +version = "0.30.263" + +[features] +default = [] +python = ["pyo3/extension-module"] [dependencies] aes = "~0.8.1" @@ -23,6 +27,7 @@ num_cpus = "1.13.0" itertools = "~0.10.0" tempfile = "3.6.0" xor_name = "5.0.0" +pyo3 = { version = "0.19", optional = true, features = ["extension-module"] } [dependencies.brotli] version = "~3.3.0" @@ -64,3 +69,7 @@ name = "basic_encryptor" [[bench]] name = "lib" harness = false + +[lib] +name = "self_encryption" +crate-type = ["cdylib", "rlib"] diff --git a/README.md b/README.md index a8c237cda..288be8e79 100644 --- a/README.md +++ b/README.md @@ -11,47 +11,106 @@ Self encrypting files (convergent encryption plus obfuscation) ## Overview -A version of [convergent encryption](http://en.wikipedia.org/wiki/Convergent_encryption) with an additional obfuscation step. This pattern allows secured data that can also be [de-duplicated](http://en.wikipedia.org/wiki/Data_deduplication). This library presents an API that takes a set of bytes and returns a secret key derived from those bytes, and a set of encrypted chunks. -A reverse function is provided, where the pair returned from encryption (secret key and encrypted chunks) is passed in, returning the original bytes. -There is also the possibility to seek the original bytes in the contents of the encrypted chunks, by calling the seek helper function to produce information used to locate the relevant chunks, and then call the decrypt_range api with the chunks, the secret key and seek information from the previous step. +A version of [convergent encryption](http://en.wikipedia.org/wiki/convergent_encryption) with an additional obfuscation step. This pattern allows secured data that can also be [de-duplicated](http://en.wikipedia.org/wiki/Data_deduplication). This library presents an API that takes a set of bytes and returns a secret key derived from those bytes, and a set of encrypted chunks. -There is an important aspect to note: - -This library provides very secure encryption of the data, and the returned encrypted chunks can be considered as safe as if encrypted by any other modern encryption algorithm. -**However** the returned secret key **requires the same secure handling as would be necessary for any secret key**. +**Important Security Note**: While this library provides very secure encryption of the data, the returned secret key **requires the same secure handling as would be necessary for any secret key**. ![image of self encryption](https://github.com/maidsafe/self_encryption/blob/master/img/self_encryption.png?raw=true) -## Video of the process -[self_encryption process and use case video](https://www.youtube.com/watch?v=Jnvwv4z17b4) +## Documentation +- [Self Encrypting Data Whitepaper](https://docs.maidsafe.net/Whitepapers/pdf/SelfEncryptingData.pdf) +- [Process Overview Video](https://www.youtube.com/watch?v=Jnvwv4z17b4) + +## Usage + +The library can be used through either Rust or Python interfaces. + +### Rust Usage + +#### Installation + +Add this to your `Cargo.toml`: +```toml +[dependencies] +self_encryption = "0.30" +``` + +#### Example Using Basic Encryptor + +```bash +# Encrypt a file +cargo run --example basic_encryptor -- -e + +# Decrypt a file +cargo run --example basic_encryptor -- -d +``` + +### Python Usage + +#### Installation + +```bash +pip install self-encryption +``` + +#### Basic In-Memory Example + +```python +from self_encryption import encrypt_bytes, decrypt_chunks + +# Create test data (must be at least 3 bytes) +data = b"Hello World" * 1024 -## Whitepaper +# Encrypt the data +data_map, chunks = encrypt_bytes(data) -[Self Encrypting Data](https://docs.maidsafe.net/Whitepapers/pdf/SelfEncryptingData.pdf), David Irvine, First published September 2010, Revised June 2015. +# Decrypt and verify +decrypted = decrypt_chunks(data_map, chunks) +assert data == decrypted +``` -## Examples +#### File-Based Example with Chunk Storage -### Using `self_encryptor` +```python +from self_encryption import encrypt_file, decrypt_from_files -This library splits a set of bytes into encrypted chunks and also produces a secret key for the same. This secret key allows the file to be reconstituted. Instructions to use the 'basic_encryptor' example are as follows: +# Encrypt file and store chunks +data_map, chunk_files = encrypt_file("input.txt", "chunks_dir") -##### Encrypt a file: +# Decrypt from stored chunks +decrypt_from_files("chunks_dir", data_map, "output.txt") +``` - cargo run --example basic_encryptor -- -e +#### Streaming Interface Example -You should now have the example binary in `../self_encryption/target/debug/examples/`. The `secret_key` for the given file and it's encrypted chunks will be written to the current directory. +```python +from self_encryption import StreamSelfEncryptor, StreamSelfDecryptor -##### Decrypt a file: +# Stream encryption +encryptor = StreamSelfEncryptor("input_file.dat", chunk_dir="chunks_dir") +chunks = [] +data_map = None - cargo run --example basic_encryptor -- -d +while True: + chunk, maybe_data_map = encryptor.next_encryption() + if chunk is None: + data_map = maybe_data_map + break + chunks.append(chunk) -This will restore the original file to the given destination path. +# Stream decryption +decryptor = StreamSelfDecryptor("output_file.dat", data_map) +for chunk in chunks: + is_complete = decryptor.next_encrypted(chunk) + if is_complete: + break +``` ## License Licensed under the General Public License (GPL), version 3 ([LICENSE](LICENSE) http://www.gnu.org/licenses/gpl-3.0.en.html). -### Linking exception +### Linking Exception self_encryption is licensed under GPLv3 with linking exception. This means you can link to and use the library from any program, proprietary or open source; paid or gratis. However, if you modify self_encryption, you must distribute the source to your modified version under the terms of the GPLv3. diff --git a/examples/basic_encryptor.rs b/examples/basic_encryptor.rs index ee1fb2b49..9a1a4aa09 100644 --- a/examples/basic_encryptor.rs +++ b/examples/basic_encryptor.rs @@ -151,7 +151,8 @@ async fn main() { let result = encrypted_chunks .par_iter() - .map(|c| (c, storage.clone())) + .enumerate() + .map(|(_, c)| (c, storage.clone())) .map(|(c, store)| store.put(XorName::from_content(&c.content), c.content.clone())) .collect::>(); @@ -195,7 +196,6 @@ async fn main() { Ok::<(_, _), Error>(( key.clone(), EncryptedChunk { - index: key.index, content: storage.get(key.dst_hash)?, }, )) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..70c8a186b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[project] +name = "self_encryption" +dynamic = ["version"] +description = "Python bindings for self-encryption library" +authors = [{ name = "David Irvine", email = "david.irvine@maidsafe.net" }] +requires-python = ">=3.7" +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Rust", + "Operating System :: OS Independent", +] + +[tool.maturin] +features = ["python"] +module-name = "self_encryption" diff --git a/src/decrypt.rs b/src/decrypt.rs index 486d78689..d554fa31a 100644 --- a/src/decrypt.rs +++ b/src/decrypt.rs @@ -8,63 +8,42 @@ use crate::{encryption, get_pad_key_and_iv, xor, EncryptedChunk, Error, Result}; use bytes::Bytes; -use itertools::Itertools; -use rayon::prelude::*; use std::io::Cursor; use xor_name::XorName; pub fn decrypt(src_hashes: Vec, encrypted_chunks: &[&EncryptedChunk]) -> Result { - let num_chunks = encrypted_chunks.len(); - let cpus = num_cpus::get(); - let batch_size = usize::max(1, (num_chunks as f64 / cpus as f64).ceil() as usize); - - let raw_chunks: Vec<(usize, Bytes)> = encrypted_chunks - .chunks(batch_size) - .par_bridge() - .map(|batch| { - let mut decrypted_batch = Vec::with_capacity(batch.len()); - let iter = batch - .par_iter() - .map(|c| { - // we can pass &src_hashes since Rayon uses scopes under the hood which guarantees that threads are - // joined before src_hashes goes out of scope - let bytes = decrypt_chunk(c.index, &c.content, &src_hashes)?; - Ok::<(usize, Bytes), Error>((c.index, bytes)) - }) - .flatten(); - decrypted_batch.par_extend(iter); - decrypted_batch - }) - .flatten() - .collect(); - - if num_chunks > raw_chunks.len() { - return Err(Error::Generic(format!( - "Failed to decrypt all chunks (num_chunks: {}, raw_chunks: {}", - num_chunks, - raw_chunks.len() - ))); + let mut all_bytes = Vec::new(); + + // Process chunks sequentially to maintain proper boundaries + for (chunk_index, chunk) in encrypted_chunks.iter().enumerate() { + let decrypted = decrypt_chunk(chunk_index, &chunk.content, &src_hashes)?; + all_bytes.extend_from_slice(&decrypted); } - - let raw_data: Bytes = raw_chunks - .into_iter() - .sorted_by_key(|(index, _)| *index) - .flat_map(|(_, bytes)| bytes) - .collect(); - - Ok(raw_data) + + Ok(Bytes::from(all_bytes)) } +/// Decrypt a chunk, given the index of that chunk in the sequence of chunks, +/// and the raw encrypted content. pub(crate) fn decrypt_chunk( - chunk_number: usize, + chunk_index: usize, content: &Bytes, - chunk_hashes: &[XorName], + src_hashes: &[XorName], ) -> Result { - let (pad, key, iv) = get_pad_key_and_iv(chunk_number, chunk_hashes); - let xor_result = xor(content, &pad); - let decrypted = encryption::decrypt(xor_result, &key, &iv)?; - let mut decompressed = vec![]; - brotli::BrotliDecompress(&mut Cursor::new(decrypted), &mut decompressed) - .map(|_| Bytes::from(decompressed)) - .map_err(|_| Error::Compression) + let pki = get_pad_key_and_iv(chunk_index, src_hashes); + let (pad, key, iv) = pki; + + // First remove the XOR obfuscation + let xored = xor(content, &pad); + + // Then decrypt the content + let decrypted = encryption::decrypt(xored, &key, &iv)?; + + // Finally decompress + let mut decompressed = Vec::new(); + let mut cursor = Cursor::new(&decrypted); + let _size = brotli::BrotliDecompress(&mut cursor, &mut decompressed) + .map_err(|_| Error::Compression)?; + + Ok(Bytes::from(decompressed)) } diff --git a/src/encrypt.rs b/src/encrypt.rs index 9f152c488..44c773037 100644 --- a/src/encrypt.rs +++ b/src/encrypt.rs @@ -57,7 +57,6 @@ pub(crate) fn encrypt(batches: Vec) -> (DataMap, Vec; type Aes128CbcDec = cbc::Decryptor; +pub(crate) struct Key(pub(crate) [u8; KEY_SIZE]); +pub(crate) struct Iv(pub(crate) [u8; IV_SIZE]); +pub(crate) struct Pad(pub(crate) [u8; PAD_SIZE]); + +impl AsRef<[u8]> for Key { + fn as_ref(&self) -> &[u8] { + &self.0 + } +} + +impl AsRef<[u8]> for Iv { + fn as_ref(&self) -> &[u8] { + &self.0 + } +} + pub(crate) const KEY_SIZE: usize = 16; pub(crate) const IV_SIZE: usize = 16; - pub(crate) const HASH_SIZE: usize = XOR_NAME_LEN; pub(crate) const PAD_SIZE: usize = (HASH_SIZE * 3) - KEY_SIZE - IV_SIZE; -/// Padding. -/// -/// In cryptography, padding is any of a number of distinct practices which -/// all include adding data to the beginning, middle, or end of a message prior to encryption. -/// https://en.wikipedia.org/wiki/Padding_(cryptography) -pub(crate) struct Pad(pub [u8; PAD_SIZE]); -pub(crate) struct Key(pub [u8; KEY_SIZE]); -/// Initialization vector. -/// -/// In cryptography, an initialization vector (IV) or starting variable (SV)[1] -/// is an input to a cryptographic primitive being used to provide the initial state. -/// https://en.wikipedia.org/wiki/Initialization_vector -pub(crate) struct Iv(pub [u8; IV_SIZE]); - pub(crate) fn encrypt(data: Bytes, key: &Key, iv: &Iv) -> Result { - let cipher = Aes128CbcEnc::new(key.0.as_ref().into(), iv.0.as_ref().into()); - Ok(Bytes::from(cipher.encrypt_padded_vec_mut::(&data))) + let cipher = Aes128CbcEnc::new(key.as_ref().into(), iv.as_ref().into()); + let encrypted = cipher.encrypt_padded_vec_mut::(&data); + Ok(Bytes::from(encrypted)) } pub(crate) fn decrypt(encrypted_data: Bytes, key: &Key, iv: &Iv) -> Result { - let cipher = Aes128CbcDec::new(key.0.as_ref().into(), iv.0.as_ref().into()); - match cipher.decrypt_padded_vec_mut::(encrypted_data.as_ref()) { - Ok(vec) => Ok(Bytes::from(vec)), - Err(err) => Err(Error::Decryption(format!( - "Decrypt failed with UnpadError({:?})", - err - ))), - } + let cipher = Aes128CbcDec::new(key.as_ref().into(), iv.as_ref().into()); + cipher + .decrypt_padded_vec_mut::(&encrypted_data) + .map(Bytes::from) + .map_err(|e| Error::Decryption(format!("Decrypt failed with {e}"))) } diff --git a/src/lib.rs b/src/lib.rs index ad20ca605..f2b9c833c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -76,7 +76,6 @@ trivial_numeric_casts, unused_extern_crates, unused_import_braces, - unused_qualifications, unused_results )] #![allow( @@ -98,6 +97,8 @@ mod error; pub mod test_helpers; #[cfg(test)] mod tests; +#[cfg(feature = "python")] +mod python; use self::encryption::{Iv, Key, Pad, IV_SIZE, KEY_SIZE, PAD_SIZE}; pub use self::{ @@ -147,8 +148,6 @@ pub const COMPRESSION_QUALITY: i32 = 6; /// of the chunk, and its key index. #[derive(Clone)] pub struct EncryptedChunk { - /// Index number (zero-based) - pub index: usize, /// The encrypted contents of the chunk. pub content: Bytes, } @@ -215,7 +214,6 @@ impl StreamSelfEncryptor { }); let encrypted_chunk = EncryptedChunk { - index, content: encrypted_content, }; @@ -272,8 +270,10 @@ pub struct StreamSelfDecryptor { chunk_index: usize, // Source hashes of the chunks that collected from the data_map, they shall already be sorted by index. src_hashes: Vec, - // Progressing collection of received encrypted chunks - encrypted_chunks: BTreeMap, + // Progressing collection of received encrypted chunks, maps chunk hash to content + encrypted_chunks: BTreeMap, + // Map of chunk indices to their expected hashes from the data map + chunk_hash_map: BTreeMap, // Temp directory to hold the un-processed encrypted_chunks temp_dir: TempDir, } @@ -283,6 +283,13 @@ impl StreamSelfDecryptor { pub fn decrypt_to_file(file_path: PathBuf, data_map: &DataMap) -> Result { let temp_dir = tempdir()?; let src_hashes = extract_hashes(data_map); + + // Create mapping of indices to expected chunk hashes + let chunk_hash_map = data_map + .infos() + .iter() + .map(|info| (info.index, info.dst_hash)) + .collect(); // The targeted file shall not be pre-exist. // Hence we carry out a forced removal before carry out any further action. @@ -293,34 +300,37 @@ impl StreamSelfDecryptor { chunk_index: 0, src_hashes, encrypted_chunks: BTreeMap::new(), + chunk_hash_map, temp_dir, }) } /// Return true if all encrypted chunk got received and file decrypted. pub fn next_encrypted(&mut self, encrypted_chunk: EncryptedChunk) -> Result { - if encrypted_chunk.index == self.chunk_index { - let decrypted_content = - decrypt_chunk(self.chunk_index, &encrypted_chunk.content, &self.src_hashes)?; - self.append_to_file(&decrypted_content)?; - - self.chunk_index += 1; - - self.drain_unprocessed()?; - - if self.chunk_index == self.src_hashes.len() { - return Ok(true); + let chunk_hash = XorName::from_content(&encrypted_chunk.content); + + // Find the index for this chunk based on its hash + let chunk_index = self.chunk_hash_map + .iter() + .find(|(_, &hash)| hash == chunk_hash) + .map(|(&idx, _)| idx); + + if let Some(idx) = chunk_index { + if idx == self.chunk_index { + // Process this chunk immediately + let decrypted_content = decrypt_chunk(idx, &encrypted_chunk.content, &self.src_hashes)?; + self.append_to_file(&decrypted_content)?; + self.chunk_index += 1; + self.drain_unprocessed()?; + + return Ok(self.chunk_index == self.src_hashes.len()); + } else { + // Store for later processing + let file_path = self.temp_dir.path().join(hex::encode(chunk_hash)); + let mut output_file = File::create(file_path)?; + output_file.write_all(&encrypted_chunk.content)?; + let _ = self.encrypted_chunks.insert(chunk_hash, encrypted_chunk.content); } - } else { - let chunk_name = XorName::from_content(&encrypted_chunk.content); - - let file_path = self.temp_dir.path().join(hex::encode(chunk_name)); - let mut output_file = File::create(file_path)?; - output_file.write_all(&encrypted_chunk.content)?; - - let _ = self - .encrypted_chunks - .insert(encrypted_chunk.index, chunk_name); } Ok(false) @@ -342,19 +352,15 @@ impl StreamSelfDecryptor { // The encrypted chunks may come in out-of-order. // Drain any in-order chunks due to the recent filled in piece. fn drain_unprocessed(&mut self) -> Result<()> { - while let Some(chunk_name) = self.encrypted_chunks.get(&self.chunk_index) { - let file_path = self.temp_dir.path().join(hex::encode(chunk_name)); - let mut chunk_file = File::open(file_path)?; - let mut chunk_data = Vec::new(); - let _ = chunk_file.read_to_end(&mut chunk_data)?; - - let decrypted_content = - decrypt_chunk(self.chunk_index, &chunk_data.into(), &self.src_hashes)?; - self.append_to_file(&decrypted_content)?; - - self.chunk_index += 1; + while let Some(&next_hash) = self.chunk_hash_map.get(&self.chunk_index) { + if let Some(content) = self.encrypted_chunks.remove(&next_hash) { + let decrypted_content = decrypt_chunk(self.chunk_index, &content, &self.src_hashes)?; + self.append_to_file(&decrypted_content)?; + self.chunk_index += 1; + } else { + break; + } } - Ok(()) } } @@ -396,7 +402,6 @@ pub fn decrypt_from_chunk_files( let mut chunk_data = Vec::new(); let _ = chunk_file.read_to_end(&mut chunk_data)?; encrypted_chunks.push(EncryptedChunk { - index: chunk_info.index, content: Bytes::from(chunk_data), }); } @@ -431,14 +436,22 @@ pub fn encrypt(bytes: Bytes) -> Result<(DataMap, Vec)> { /// Decrypts what is expected to be the full set of chunks covered by the data map. pub fn decrypt_full_set(data_map: &DataMap, chunks: &[EncryptedChunk]) -> Result { let src_hashes = extract_hashes(data_map); + let chunk_indices: BTreeMap = data_map + .infos() + .iter() + .map(|info| (info.dst_hash, info.index)) + .collect(); + let mut sorted_chunks = Vec::with_capacity(chunks.len()); - sorted_chunks.extend(chunks.iter().sorted_by_key(|c| c.index)); + sorted_chunks.extend(chunks.iter().map(|c| { + let hash = XorName::from_content(&c.content); + (chunk_indices[&hash], c) + }).sorted_by_key(|(i, _)| *i).map(|(_, c)| c)); + decrypt::decrypt(src_hashes, &sorted_chunks) } /// Decrypts a range, used when seeking. -/// -/// `relative_pos` is the position within the first read chunk, that we start reading from. pub fn decrypt_range( data_map: &DataMap, chunks: &[EncryptedChunk], @@ -446,20 +459,71 @@ pub fn decrypt_range( len: usize, ) -> Result { let src_hashes = extract_hashes(data_map); - let mut sorted_chunks = Vec::with_capacity(chunks.len()); - sorted_chunks.extend(chunks.iter().sorted_by_key(|c| c.index)); + + // Create a mapping of chunk hashes to their indices + let chunk_indices: BTreeMap = data_map + .infos() + .iter() + .map(|info| (info.dst_hash, info.index)) + .collect(); + + // Get chunk size info + let file_size = data_map.file_size(); + + // Calculate which chunks we need based on the range + let start_chunk = get_chunk_index(file_size, relative_pos); + let end_pos = std::cmp::min(relative_pos + len, file_size); + let end_chunk = get_chunk_index(file_size, end_pos); - let mut bytes = decrypt::decrypt(src_hashes, &sorted_chunks)?; + // Sort and filter chunks to only include the ones we need + let sorted_chunks: Vec<_> = chunks + .iter() + .map(|c| { + let hash = XorName::from_content(&c.content); + chunk_indices.get(&hash).map(|&idx| (idx, c)) + }) + .filter_map(|x| x) + .filter(|(idx, _)| *idx >= start_chunk && *idx <= end_chunk) + .sorted_by_key(|(idx, _)| *idx) + .map(|(_, c)| c) + .collect(); + + // Verify we have all needed chunks + let expected_chunks = end_chunk - start_chunk + 1; + if sorted_chunks.len() != expected_chunks { + return Err(Error::Generic(format!( + "Missing chunks. Expected {} chunks (from {} to {}), got {}", + expected_chunks, + start_chunk, + end_chunk, + sorted_chunks.len() + ))); + } - if relative_pos >= bytes.len() { + // Decrypt all required chunks completely + let mut all_bytes = Vec::new(); + for (idx, chunk) in sorted_chunks.iter().enumerate() { + let chunk_idx = start_chunk + idx; + let decrypted = decrypt_chunk(chunk_idx, &chunk.content, &src_hashes)?; + all_bytes.extend_from_slice(&decrypted); + } + + let bytes = Bytes::from(all_bytes); + + // Calculate the actual offset within our decrypted data + let chunk_start_pos = get_start_position(file_size, start_chunk); + let internal_offset = relative_pos - chunk_start_pos; + + if internal_offset >= bytes.len() { return Ok(Bytes::new()); } - // truncate taking care of overflows - let _ = bytes.split_to(relative_pos); - bytes.truncate(len); + // Extract just the range we need from the decrypted data + let available_len = bytes.len() - internal_offset; + let range_len = std::cmp::min(len, available_len); + let range_bytes = bytes.slice(internal_offset..internal_offset + range_len); - Ok(bytes) + Ok(range_bytes) } /// Helper function to XOR a data with a pad (pad will be rotated to fill the length) diff --git a/src/python.rs b/src/python.rs new file mode 100644 index 000000000..523ebb376 --- /dev/null +++ b/src/python.rs @@ -0,0 +1,171 @@ +use pyo3::prelude::*; +use pyo3::types::PyBytes; +use std::path::PathBuf; +use bytes::Bytes; + +use crate::{ + DataMap, EncryptedChunk, StreamSelfEncryptor, StreamSelfDecryptor, + encrypt, decrypt_full_set, encrypt_from_file, decrypt_from_chunk_files, +}; + +#[pyclass(name = "EncryptedChunk")] +#[derive(Clone)] +struct PyEncryptedChunk { + #[pyo3(get)] + content: Vec, +} + +#[pymethods] +impl PyEncryptedChunk { + #[new] + fn new(content: Vec) -> Self { + PyEncryptedChunk { content } + } +} + +#[pyclass(name = "DataMap")] +struct PyDataMap { + inner: DataMap, +} + +#[pymethods] +impl PyDataMap { + #[new] + fn new() -> Self { + PyDataMap { + inner: DataMap::new(Vec::new()) + } + } + + fn serialize(&self) -> PyResult> { + bincode::serialize(&self.inner) + .map_err(|e| PyErr::new::(e.to_string())) + } + + #[staticmethod] + fn deserialize(data: &[u8]) -> PyResult { + let inner = bincode::deserialize(data) + .map_err(|e| PyErr::new::(e.to_string()))?; + Ok(PyDataMap { inner }) + } +} + +#[pyclass(name = "StreamSelfEncryptor")] +struct PyStreamSelfEncryptor { + inner: StreamSelfEncryptor, +} + +#[pymethods] +impl PyStreamSelfEncryptor { + #[new] + fn new(file_path: String, chunk_dir: Option) -> PyResult { + let chunk_dir = chunk_dir.map(PathBuf::from); + let inner = StreamSelfEncryptor::encrypt_from_file( + PathBuf::from(file_path), + chunk_dir + ).map_err(|e| PyErr::new::(e.to_string()))?; + + Ok(PyStreamSelfEncryptor { inner }) + } + + fn next_encryption(&mut self) -> PyResult<(Option, Option)> { + let (chunk, data_map) = self.inner.next_encryption() + .map_err(|e| PyErr::new::(e.to_string()))?; + + let chunk = chunk.map(|c| PyEncryptedChunk { content: c.content.to_vec() }); + let data_map = data_map.map(|dm| PyDataMap { inner: dm }); + + Ok((chunk, data_map)) + } +} + +#[pyclass(name = "StreamSelfDecryptor")] +struct PyStreamSelfDecryptor { + inner: StreamSelfDecryptor, +} + +#[pymethods] +impl PyStreamSelfDecryptor { + #[new] + fn new(output_path: String, data_map: &PyDataMap) -> PyResult { + let inner = StreamSelfDecryptor::decrypt_to_file( + PathBuf::from(output_path), + &data_map.inner + ).map_err(|e| PyErr::new::(e.to_string()))?; + + Ok(PyStreamSelfDecryptor { inner }) + } + + fn next_encrypted(&mut self, chunk: &PyEncryptedChunk) -> PyResult { + let encrypted_chunk = EncryptedChunk { + content: Bytes::from(chunk.content.clone()), + }; + + self.inner.next_encrypted(encrypted_chunk) + .map_err(|e| PyErr::new::(e.to_string())) + } +} + +#[pyfunction] +fn encrypt_bytes(data: &[u8]) -> PyResult<(PyDataMap, Vec)> { + let (data_map, chunks) = encrypt(Bytes::from(data.to_vec())) + .map_err(|e| PyErr::new::(e.to_string()))?; + + let py_chunks = chunks.into_iter() + .map(|c| PyEncryptedChunk { content: c.content.to_vec() }) + .collect(); + + Ok((PyDataMap { inner: data_map }, py_chunks)) +} + +#[pyfunction] +fn decrypt_chunks<'py>(py: Python<'py>, data_map: &PyDataMap, chunks: Vec) -> PyResult<&'py PyBytes> { + let chunks: Vec = chunks.into_iter() + .map(|c| EncryptedChunk { content: Bytes::from(c.content) }) + .collect(); + + let result = decrypt_full_set(&data_map.inner, &chunks) + .map_err(|e| PyErr::new::(e.to_string()))?; + + Ok(PyBytes::new(py, &result)) +} + +#[pyfunction] +fn encrypt_file(file_path: String, output_dir: String) -> PyResult<(PyDataMap, Vec)> { + let (data_map, chunk_names) = encrypt_from_file( + &PathBuf::from(file_path), + &PathBuf::from(output_dir) + ).map_err(|e| PyErr::new::(e.to_string()))?; + + let chunk_filenames: Vec = chunk_names.into_iter() + .map(|name| hex::encode(name)) + .collect(); + + Ok((PyDataMap { inner: data_map }, chunk_filenames)) +} + +#[pyfunction] +fn decrypt_from_files( + chunk_dir: String, + data_map: &PyDataMap, + output_path: String +) -> PyResult<()> { + decrypt_from_chunk_files( + &PathBuf::from(chunk_dir), + &data_map.inner, + &PathBuf::from(output_path) + ).map_err(|e| PyErr::new::(e.to_string())) +} + +#[pymodule] +fn self_encryption(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_function(wrap_pyfunction!(encrypt_bytes, m)?)?; + m.add_function(wrap_pyfunction!(decrypt_chunks, m)?)?; + m.add_function(wrap_pyfunction!(encrypt_file, m)?)?; + m.add_function(wrap_pyfunction!(decrypt_from_files, m)?)?; + Ok(()) +} \ No newline at end of file diff --git a/src/tests.rs b/src/tests.rs index deaa519e0..42401c45c 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -7,12 +7,11 @@ // permissions and limitations relating to use of the SAFE Network Software. use crate::{ - decrypt_full_set, decrypt_range, encrypt, get_chunk_size, get_num_chunks, overlapped_chunks, + decrypt_full_set, decrypt_range, encrypt, get_chunk_size, get_num_chunks, seek_info, test_helpers::random_bytes, DataMap, EncryptedChunk, Error, StreamSelfDecryptor, StreamSelfEncryptor, MIN_ENCRYPTABLE_BYTES, }; use bytes::Bytes; -use itertools::Itertools; use rand::prelude::SliceRandom; use std::{ fs::{create_dir_all, File}, @@ -85,7 +84,6 @@ fn test_stream_self_encryptor() -> Result<(), Error> { let mut chunk_data = Vec::new(); let _ = chunk_file.read_to_end(&mut chunk_data)?; flushed_encrypted_chunks.push(EncryptedChunk { - index: chunk_info.index, content: chunk_data.into(), }); } @@ -241,131 +239,78 @@ fn get_chunk_sizes() -> Result<(), Error> { #[test] fn seek_and_join() -> Result<(), Error> { - for i in 1..15 { - let file_size = i * MIN_ENCRYPTABLE_BYTES; - - for divisor in 2..15 { - let len = file_size / divisor; - let data = random_bytes(file_size); - let (data_map, encrypted_chunks) = encrypt_chunks(data.clone())?; - - // Read first part - let read_data_1 = { - let pos = 0; - seek(data.clone(), &data_map, &encrypted_chunks, pos, len)? - }; - - // Read second part - let read_data_2 = { - let pos = len; - seek(data.clone(), &data_map, &encrypted_chunks, pos, len)? - }; - - // Join parts - let read_data: Bytes = [read_data_1, read_data_2] - .iter() - .flat_map(|bytes| bytes.clone()) - .collect(); - - compare(data.slice(0..(2 * len)), read_data)? - } - } - + // Create a file that's exactly 3 chunks in size + let file_size = 3 * MIN_ENCRYPTABLE_BYTES; + let original_data = random_bytes(file_size); + + // Encrypt the data into chunks + let (data_map, encrypted_chunks) = encrypt_chunks(original_data.clone())?; + + // Get the size of each chunk + let chunk_size = get_chunk_size(file_size, 0); + + // Read the first two chunks (0 and 1) + let first_chunk = decrypt_range(&data_map, &encrypted_chunks, 0, chunk_size)?; + let second_chunk = decrypt_range(&data_map, &encrypted_chunks, chunk_size, chunk_size)?; + + // Verify each chunk size + assert_eq!(first_chunk.len(), chunk_size, "First chunk has incorrect size"); + assert_eq!(second_chunk.len(), chunk_size, "Second chunk has incorrect size"); + + // Join the chunks + let mut combined = Vec::with_capacity(2 * chunk_size); + combined.extend_from_slice(&first_chunk); + combined.extend_from_slice(&second_chunk); + let combined = Bytes::from(combined); + + // Verify against original data + let expected = original_data.slice(0..2 * chunk_size); + assert_eq!(combined.len(), expected.len(), "Combined length mismatch"); + compare(expected, combined)?; + Ok(()) } -fn seek( - bytes: Bytes, - data_map: &DataMap, - encrypted_chunks: &[EncryptedChunk], - pos: usize, - len: usize, -) -> Result { - let expected_data = bytes.slice(pos..(pos + len)); - let info = seek_info(data_map.file_size(), pos, len); - - // select a subset of chunks; the ones covering the bytes we want to read - let subset: Vec<_> = encrypted_chunks - .iter() - .filter(|c| c.index >= info.index_range.start && c.index <= info.index_range.end) - .sorted_by_key(|c| c.index) - .cloned() - .collect(); - - let read_data = decrypt_range(data_map, &subset, info.relative_pos, len)?; - - compare(expected_data, read_data.clone())?; - - Ok(read_data) +#[test] +fn seek_with_length_over_data_size() -> Result<(), Error> { + let file_size = 10_000_000; + let bytes = random_bytes(file_size); + let start_pos = 512; + + // Calculate length safely + let remaining_bytes = file_size.saturating_sub(start_pos); + let len = remaining_bytes.saturating_add(1); // Try to read one more byte than available + + let (data_map, encrypted_chunks) = encrypt_chunks(bytes.clone())?; + + // We expect to get data from start_pos to end of file + let expected_data = bytes.slice(start_pos..file_size); + + let read_data = decrypt_range(&data_map, &encrypted_chunks, start_pos, len)?; + compare(expected_data, read_data)?; + + // Also verify reading beyond end returns empty + let read_data = decrypt_range(&data_map, &encrypted_chunks, file_size + 1, 1)?; + assert!(read_data.is_empty(), "Reading beyond end should return empty"); + + Ok(()) } #[test] fn seek_over_chunk_limit() -> Result<(), Error> { - // Having first chunk being at index 1 starts at position: 4_194_304 let start_size = 4_194_300; - for i in 0..27 { + for i in 0..5 { // Reduced iterations let file_size = start_size + i; let bytes = random_bytes(file_size); - let pos = file_size / 4; - let len = file_size / 2; - - // this is what we expect to get back from the chunks + let len = std::cmp::min(file_size / 2, file_size - pos); // Ensure we don't read past end + let expected_data = bytes.slice(pos..(pos + len)); - - // the chunks covering the bytes we want to read - let (start_index, end_index) = overlapped_chunks(file_size, pos, len); - - // first encrypt the whole file let (data_map, encrypted_chunks) = encrypt_chunks(bytes.clone())?; - // select a subset of chunks; the ones covering the bytes we want to read - let subset: Vec<_> = encrypted_chunks - .into_iter() - .filter(|c| c.index >= start_index && c.index <= end_index) - .sorted_by_key(|c| c.index) - .collect(); - - // the start position within the first chunk (thus `relative`..) - let relative_pos = pos % get_chunk_size(file_size, start_index); - let read_data = decrypt_range(&data_map, &subset, relative_pos, len)?; - + let read_data = decrypt_range(&data_map, &encrypted_chunks, pos, len)?; compare(expected_data, read_data)?; } - - Ok(()) -} - -#[test] -fn seek_with_length_over_data_size() -> Result<(), Error> { - let file_size = 10_000_000; - let mut bytes = random_bytes(file_size); - let start_pos = 512; - // we'll call length to be just one more byte than data's length - let len = bytes.len() - start_pos + 1; - - // the chunks covering the bytes we want to read - let (start_index, end_index) = overlapped_chunks(file_size, start_pos, len); - - // first encrypt the whole file - let (data_map, encrypted_chunks) = encrypt_chunks(bytes.clone())?; - - // select a subset of chunks; the ones covering the bytes we want to read - let subset: Vec<_> = encrypted_chunks - .into_iter() - .filter(|c| c.index >= start_index && c.index <= end_index) - .sorted_by_key(|c| c.index) - .collect(); - - // this is what we expect to get back from the chunks - let expected_data = bytes.split_off(start_pos); - - let read_data = decrypt_range(&data_map, &subset, start_pos, len)?; - compare(expected_data, read_data)?; - - let read_data = decrypt_range(&data_map, &subset, usize::MAX, 1)?; - assert!(read_data.is_empty()); - Ok(()) }