Skip to content

Commit

Permalink
Clean up JSON and SARIF report format
Browse files Browse the repository at this point in the history
  • Loading branch information
bradlarsen committed Aug 15, 2023
1 parent f724a82 commit 47a46a1
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 63 deletions.
11 changes: 7 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- The Git repository cloning behavior in the `scan` command can now be controlled with the new `--git-clone-mode {mirror,bare}` parameter.

- The `scan` command now collects additional metadata about blobs.
This metadata includes size in bytes and guessed mime type based on filename extension.
Optionally, if the non-default `libmagic` Cargo feature is enabled, the mime type and charset are guessed by passing the content of the blob through `libmagic` (the guts of the `file` command-line program).

Additionally, for each blob found in Git repository history, the set of commits where it was introduced and the accompanying pathname for the blob are collected ([#16](https://github.com/praetorian-inc/noseyparker/issues/16)).
This can be controlled using the new `--git-blob-provenance={first-seen,minimal}` parameter.

By default, all this additional metadata is recorded into the datastore for each blob in which matches are found.
This can be more precisely controlled using the new `--blob-metadata={all,matching,none}` parameter.

This newly-collected metadata is included in the output of the `report` command.

- The `scan` command now collects additional metadata about blobs found within Git repositories.
Specifically, for each blob found in Git repository history, the set of commits where it was introduced and the accompanying pathname for the blob are collected ([#16](https://github.com/praetorian-inc/noseyparker/issues/16)).
This is enabled by default, but can be controlled using the new `--git-blob-provenance={first-seen,minimal}` parameter.

This newly-collected metadata is included in the output of the `report` command.

### Changes
- Existing rules were modified to reduce both false positives and false negatives:

Expand Down
120 changes: 64 additions & 56 deletions crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,57 +130,67 @@ impl Reportable for DetailsReporter {
// Will store every match location for the runs.results.location array property
let locations: Vec<sarif::Location> = matches
.into_iter()
.map(|ReportMatch { ps, md, m }| {
let source_span = &m.location.source_span;
// let offset_span = &m.location.offset_span;

// FIXME: rework for the expanded git provenance data
let uri = "FIXME: rework for the expanded git provenance data".to_string();

let properties = sarif::PropertyBagBuilder::default()
.additional_properties([
(String::from("mime_essence"), serde_json::json!(md.mime_essence)),
(String::from("charset"), serde_json::json!(md.charset)),
(String::from("num_bytes"), serde_json::json!(md.num_bytes)),
])
.build()?;

let location = sarif::LocationBuilder::default()
.physical_location(
sarif::PhysicalLocationBuilder::default()
.artifact_location(
sarif::ArtifactLocationBuilder::default()
.uri(uri)
.build()?,
)
// .context_region() FIXME: fill this in with location info of surrounding context
.region(
sarif::RegionBuilder::default()
.start_line(source_span.start.line as i64)
.start_column(source_span.start.column as i64)
.end_line(source_span.end.line as i64)
.end_column(source_span.end.column as i64 + 1)
// FIXME: including byte offsets seems to confuse VSCode SARIF Viewer. Why?
/*
.byte_offset(offset_span.start as i64)
.byte_length(offset_span.len() as i64)
*/
.snippet(
sarif::ArtifactContentBuilder::default()
.text(m.snippet.matching.to_string())
.build()?,
)
.build()?,
)
.build()?,
)
.logical_locations([sarif::LogicalLocationBuilder::default()
.kind("blob")
.name(m.blob_id.to_string())
.properties(properties)
.build()?])
.build()?;
Ok(location)
.flat_map(|ReportMatch { ps, md, m }| {
ps.into_iter().map(move |p| {
let source_span = &m.location.source_span;
// let offset_span = &m.location.offset_span;

let mut properties = sarif::PropertyBagBuilder::default();
properties.additional_properties([
(String::from("blob_metadata"), serde_json::json!(md)),
]);

let uri = match p {
Provenance::File(e) => e.path.to_string_lossy().into_owned(),
Provenance::GitRepo(e) => {
if let Some(p) = e.commit_provenance {
properties.additional_properties([
(String::from("commit_provenance"), serde_json::json!(p)),
]);
}
e.repo_path.to_string_lossy().into_owned()
}
};

let properties = properties.build()?;

let location = sarif::LocationBuilder::default()
.physical_location(
sarif::PhysicalLocationBuilder::default()
.artifact_location(
sarif::ArtifactLocationBuilder::default()
.uri(uri)
.build()?,
)
// .context_region() FIXME: fill this in with location info of surrounding context
.region(
sarif::RegionBuilder::default()
.start_line(source_span.start.line as i64)
.start_column(source_span.start.column as i64)
.end_line(source_span.end.line as i64)
.end_column(source_span.end.column as i64 + 1)
// FIXME: including byte offsets seems to confuse VSCode SARIF Viewer. Why?
/*
.byte_offset(offset_span.start as i64)
.byte_length(offset_span.len() as i64)
*/
.snippet(
sarif::ArtifactContentBuilder::default()
.text(m.snippet.matching.to_string())
.build()?,
)
.build()?,
)
.build()?,
)
.logical_locations([sarif::LogicalLocationBuilder::default()
.kind("blob")
.name(m.blob_id.to_string())
.properties(properties)
.build()?])
.build()?;
Ok(location)
})
})
.collect::<Result<_>>()?;

Expand Down Expand Up @@ -390,7 +400,9 @@ impl Display for MatchGroup {
if let Some(cs) = &e.commit_provenance {
let cmd = &cs.commit_metadata;
let msg = BStr::new(cmd.message.lines().next().unwrap_or(&[]));
let ctime = cmd.committer_timestamp.format(time::macros::format_description!("[year]-[month]-[day]"));
let ctime = cmd
.committer_timestamp
.format(time::macros::format_description!("[year]-[month]-[day]"));
writeln!(
f,
"{} {} in {}",
Expand All @@ -405,18 +417,14 @@ impl Display for MatchGroup {
{} {} <{}>\n\
{} {}\n\
{} {}",

STYLE_HEADING.apply_to("Author:"),
cmd.author_name,
cmd.author_email,

STYLE_HEADING.apply_to("Committer:"),
cmd.committer_name,
cmd.committer_email,

STYLE_HEADING.apply_to("Date:"),
ctime,

STYLE_HEADING.apply_to("Summary:"),
msg,
)?;
Expand Down
4 changes: 3 additions & 1 deletion crates/noseyparker/src/provenance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ pub struct GitRepoProvenance {

/// What is the kind of this commit metadata?
#[derive(Debug, Copy, Clone, Serialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case", tag = "kind")]
#[serde(rename_all = "snake_case")]
pub enum CommitKind {
/// The first commit in which a blob was seen
FirstSeen,
Expand Down Expand Up @@ -129,9 +129,11 @@ impl std::fmt::Display for CommitKind {
}
}

/// How was a particular Git commit encountered?
#[derive(Debug, Clone, Serialize, PartialEq, Eq, Hash)]
pub struct CommitProvenance {
pub commit_kind: CommitKind,

pub commit_metadata: CommitMetadata,

#[serde(with = "BStringSerde")]
Expand Down
9 changes: 7 additions & 2 deletions crates/noseyparker/src/provenance_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,15 @@ impl ProvenanceSet {
pub fn iter(&self) -> impl Iterator<Item = &Provenance> {
std::iter::once(&self.provenance).chain(&self.more_provenance)
}
}

impl IntoIterator for ProvenanceSet {
type Item = Provenance;
type IntoIter =
std::iter::Chain<std::iter::Once<Provenance>, <Vec<Provenance> as IntoIterator>::IntoIter>;

#[allow(clippy::should_implement_trait)]
#[inline]
pub fn into_iter(self) -> impl IntoIterator<Item = Provenance> {
fn into_iter(self) -> Self::IntoIter {
std::iter::once(self.provenance).chain(self.more_provenance)
}
}

0 comments on commit 47a46a1

Please sign in to comment.