Skip to content

Commit

Permalink
Simplify crates.io API caching
Browse files Browse the repository at this point in the history
Previously, we would perform time-based caching for data from the
crates.io API in order to avoid unnecessary load on it. Unfortunately,
this would sometimes lead to situations such as #629 where the cache is
out-of-date and confusing errors are produced.

Since the last time we looked at this, the API guidance for the sparse
HTTP index has been clarified, explicitly having "no rate limit". This
allows us to simplify the caching code dramatically, caching only the
data from the rate-limited crates.io API, and always checking the sparse
HTTP index to see if new versions have been published.

The new caching system no longer needs expiry times, as it will be
updated consistently whenever a new version is published to the
crates.io index.

After this change, we will make more network requests than before when
running cargo-vet to check the index.
  • Loading branch information
mystor committed Sep 3, 2024
1 parent 0f4db3c commit 9887560
Show file tree
Hide file tree
Showing 6 changed files with 305 additions and 581 deletions.
33 changes: 14 additions & 19 deletions src/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use core::{cmp, fmt};
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;

use cargo_metadata::{semver, Package};
use serde::{de, de::Visitor, Deserialize, Deserializer, Serialize, Serializer};
Expand Down Expand Up @@ -94,6 +95,16 @@ impl VetVersion {
pub fn equals_semver(&self, semver: &semver::Version) -> bool {
self.git_rev.is_none() && &self.semver == semver
}

/// Get this VetVersion as a semver::Version, returning None if this version
/// corresponds to a git revision.
pub fn as_semver(&self) -> Option<&semver::Version> {
if self.git_rev.is_none() {
Some(&self.semver)
} else {
None
}
}
}
impl fmt::Display for VetVersion {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
Expand Down Expand Up @@ -1178,30 +1189,14 @@ pub struct CratesCacheVersionDetails {

#[derive(Serialize, Deserialize, Default, Debug, Clone)]
pub struct CratesCacheEntry {
pub last_fetched: chrono::DateTime<chrono::Utc>,
/// If versions is empty, this indicates that we queried the info and found that the crate has
/// no published versions (and thus doesn't exist as of `last_fetched`).
///
/// Versions are a sorted map because we sometimes need to iterate in order. We don't use a sorted
/// Vec because we may partially update the versions when we access the index (though technically
/// that update _should_ only have new versions which would append to a Vec if it were that).
pub versions: SortedMap<semver::Version, Option<CratesCacheVersionDetails>>,
pub metadata: Option<CratesAPICrateMetadata>,
pub versions: SortedMap<semver::Version, CratesCacheVersionDetails>,
pub metadata: CratesAPICrateMetadata,
}

#[derive(Serialize, Deserialize, Clone, Default)]
pub struct CratesCache {
pub users: SortedMap<CratesUserId, CratesCacheUser>,
pub crates: SortedMap<PackageName, CratesCacheEntry>,
}

impl CratesCacheEntry {
/// Return whether the crate exists.
///
/// The cache stores non-existent results when fetching.
pub fn exists(&self) -> bool {
!self.versions.is_empty()
}
pub crates: SortedMap<PackageName, Arc<CratesCacheEntry>>,
}

////////////////////////////////////////////////////////////////////////////////////
Expand Down
54 changes: 21 additions & 33 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1962,26 +1962,23 @@ async fn fix_audit_as(
// false positives, but is certainly a bit of a loose
// comparison. If it turns out to be an issue we can
// improve it in the future.
let default_audit_as = if network.is_some() {
// NOTE: Handle all errors silently here, as we can always recover by
// setting `audit-as-crates-io = false`. The error cases below are very
// unlikely to occur since information will be cached from the initial
// checks which generated the NeedsAuditAsErrors.
let crates_api_metadata =
match cache.get_crate_metadata(network, &err.package).await {
Ok(v) => v,
Err(e) => {
warn!("crate metadata error for {}: {e}", &err.package);
Default::default()
}
};

cfg.metadata.packages.iter().any(|p| {
p.name == err.package && crates_api_metadata.consider_as_same(p)
})
} else {
false
};
//
// NOTE: Handle all errors silently here, as we can
// always recover by setting `audit-as-crates-io =
// false`. The error cases below are very unlikely to
// occur since information will be cached from the
// initial checks which generated the
// NeedsAuditAsErrors.
let default_audit_as =
match cache.crates_io_info(network, &err.package).await {
Ok(entry) => cfg.metadata.packages.iter().any(|p| {
p.name == err.package && entry.metadata.consider_as_same(p)
}),
Err(e) => {
warn!("crate metadata error for {}: {e}", &err.package);
false
}
};

get_policy_entry(store, cfg, &third_party_packages, &err)
.audit_as_crates_io = Some(default_audit_as);
Expand Down Expand Up @@ -2937,19 +2934,10 @@ async fn check_audit_as_crates_io(
return None;
}

// To avoid unnecessary metadata lookup, only do so for packages which exist in the
// index. The case which doesn't work with this logic is if someone is using a
// package before it has ever been published, and then later it is published (in
// which case a third-party change causes a warning to unexpectedly come up).
// However, this case is sufficiently unlikely that for now it's worth the initial
// lookup to avoid unnecessarily trying to fetch metadata for unpublished crates.
//
// The caching logic already does this for us as an optimization, but since we may
// need to look at the specific versions later, we fetch it anyway.
let mut matches_crates_io_package = false;
if let Ok(metadata) = cache.get_crate_metadata(network, &package.name).await {
matches_crates_io_package = metadata.consider_as_same(package);
}
let matches_crates_io_package = cache
.crates_io_info(network, &package.name)
.await
.map_or(false, |entry| entry.metadata.consider_as_same(package));

if matches_crates_io_package && audit_policy.is_none() {
// We found a package that has similar metadata to one with the same name
Expand Down
68 changes: 32 additions & 36 deletions src/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1768,30 +1768,20 @@ impl<'a> ResolveReport<'a> {
// Attempt to look up the publisher of the target version
// for the suggested diff, and also record whether the given
// package has a sole publisher.
let mut is_sole_publisher = false;
let publisher_id =
if let (Some(network), None) = (&network, &suggested_diff.to.git_rev) {
let versions = cache
.get_publishers(
Some(network),
package.name,
[&suggested_diff.to.semver].into_iter().collect(),
)
.await
.unwrap_or_default();
let publisher_count = versions
.iter()
.flat_map(|(_, details)| &details.published_by)
.collect::<FastSet<_>>()
.len();
is_sole_publisher = publisher_count == 1;
versions
.into_iter()
.find(|(v, _)| v == &suggested_diff.to.semver)
.and_then(|(_, d)| d.published_by)
} else {
None
};
let crates_io_info = cache.crates_io_info(network, package.name).await.ok();
let publisher_id = match (suggested_diff.to.as_semver(), &crates_io_info) {
(Some(semver), Some(metadata)) => metadata
.versions
.get(semver)
.and_then(|details| details.published_by),
_ => None,
};
let publisher_count = crates_io_info
.iter()
.flat_map(|m| m.versions.values())
.flat_map(|details| &details.published_by)
.collect::<FastSet<_>>()
.len();

// Compute the trust hint, which is the information used to generate "consider
// cargo trust FOO" messages. There can be multiple potential hints, but we
Expand All @@ -1809,12 +1799,14 @@ impl<'a> ResolveReport<'a> {
exact_version = true;
publisher_id
} else if !store.audits.trusted.contains_key(package.name) {
cache
.get_cached_publishers(package.name)
.iter()
.rev()
.filter_map(|(_, details)| details.published_by)
.find(|i| trusted_publishers.contains_key(i))
crates_io_info.as_ref().and_then(|metadata| {
metadata
.versions
.iter()
.rev()
.filter_map(|(_, details)| details.published_by)
.find(|i| trusted_publishers.contains_key(i))
})
} else {
None
};
Expand Down Expand Up @@ -1930,7 +1922,7 @@ impl<'a> ResolveReport<'a> {
notable_parents: notable_parents.clone(),
publisher_login,
trust_hint,
is_sole_publisher,
is_sole_publisher: publisher_count == 1,
registry_suggestion,
}])
.collect()
Expand Down Expand Up @@ -2555,7 +2547,11 @@ async fn suggest_delta(
) -> Option<(DiffRecommendation, Option<DiffRecommendation>)> {
// Fetch the set of known versions from crates.io so we know which versions
// we'll have sources for.
let known_versions = cache.get_versions(network, package_name).await.ok();
let known_versions = if let Some(network) = network {
cache.published_versions(network, package_name).await.ok()
} else {
None
};

// Collect up the details of how we failed
struct Reachable<'a> {
Expand Down Expand Up @@ -2590,11 +2586,11 @@ async fn suggest_delta(
}
// We have sources if the version has been published to crates.io.
//
// For testing fallbacks, assume we always have sources if the
// index is unavailable.
// For testing fallbacks or when we're offline, assume we always
// have sources if the index is unavailable.
known_versions
.as_ref()
.map_or(true, |versions| versions.contains(&ver.semver))
.map_or(true, |versions| versions.contains_key(&ver.semver))
};
reachable = Some(Reachable {
from_root: reachable_from_root
Expand Down Expand Up @@ -2640,7 +2636,7 @@ async fn suggest_delta(
// encourage auditing first.
let closest_below = if let Some(known_versions) = &known_versions {
known_versions
.iter()
.keys()
.filter(|&v| v <= &package_version.semver)
.max()
} else {
Expand Down
Loading

0 comments on commit 9887560

Please sign in to comment.