From 977618a96e2bff7ae12913028971a6c85032e240 Mon Sep 17 00:00:00 2001 From: Chris Campo Date: Wed, 27 Mar 2024 12:13:57 -0400 Subject: [PATCH 01/14] Initial discovery and classification CLI implementation --- Dockerfile | 20 + classification/classification.go | 153 ++++++ .../classification_fakeclassifier_test.go | 213 ++++++++ classification/examples/example_policy.rego | 70 +++ .../examples/example_policy_simple.rego | 22 + classification/label.go | 43 ++ classification/publisher/publisher.go | 21 + classification/publisher/stdout.go | 39 ++ classification/rego/README.md | 8 + classification/rego/address.rego | 34 ++ classification/rego/address_test.rego | 61 +++ classification/rego/age.rego | 15 + classification/rego/age_test.rego | 37 ++ classification/rego/ccn.rego | 17 + classification/rego/ccn_test.rego | 26 + classification/rego/cvv.rego | 58 +++ classification/rego/cvv_test.rego | 29 ++ classification/rego/dob.rego | 45 ++ classification/rego/dob_test.rego | 110 +++++ classification/rego/email.rego | 17 + classification/rego/email_test.rego | 13 + classification/rego/first_name.rego | 20 + classification/rego/first_name_test.rego | 49 ++ classification/rego/full_name.rego | 15 + classification/rego/full_name_test.rego | 25 + classification/rego/ip_address.rego | 28 ++ classification/rego/ip_address_test.rego | 25 + classification/rego/labels.yaml | 77 +++ classification/rego/last_name.rego | 20 + classification/rego/last_name_test.rego | 49 ++ classification/rego/passport.rego | 22 + classification/rego/passport_test.rego | 65 +++ classification/rego/phone_number.rego | 17 + classification/rego/phone_number_test.rego | 13 + classification/rego/ssn.rego | 24 + classification/rego/ssn_test.rego | 73 +++ classification/rego_classifier.go | 126 +++++ classification/rego_classifier_test.go | 457 ++++++++++++++++++ cmd/main.go | 75 +++ cmd/repo_scan.go | 18 + discovery/config/config.go | 59 +++ discovery/config/util.go | 138 ++++++ discovery/config/util_test.go 
| 178 +++++++ discovery/repository/denodo/denodo.go | 114 +++++ discovery/repository/doc.go | 15 + discovery/repository/gen.go | 5 + discovery/repository/genericsql/genericsql.go | 303 ++++++++++++ .../repository/genericsql/genericsql_test.go | 301 ++++++++++++ discovery/repository/genericsql/util.go | 35 ++ discovery/repository/metadata.go | 81 ++++ discovery/repository/metadata_test.go | 71 +++ discovery/repository/mock_repository.go | 342 +++++++++++++ discovery/repository/mysql/mysql.go | 97 ++++ discovery/repository/mysql/mysql_test.go | 31 ++ discovery/repository/oracle/config.go | 27 ++ discovery/repository/oracle/oracle.go | 125 +++++ discovery/repository/oracle/oracle_test.go | 86 ++++ discovery/repository/postgresql/config.go | 22 + discovery/repository/postgresql/postgresql.go | 103 ++++ .../repository/postgresql/postgresql_test.go | 31 ++ discovery/repository/redshift/redshift.go | 94 ++++ .../repository/redshift/redshift_test.go | 32 ++ discovery/repository/repository.go | 237 +++++++++ discovery/repository/repository_test.go | 134 +++++ .../repository/sample_all_databases_test.go | 228 +++++++++ .../repository/sample_repository_test.go | 181 +++++++ discovery/repository/sampling.go | 56 +++ discovery/repository/sampling_test.go | 77 +++ discovery/repository/snowflake/config.go | 36 ++ discovery/repository/snowflake/snowflake.go | 100 ++++ .../repository/snowflake/snowflake_test.go | 31 ++ discovery/repository/sqlserver/sqlserver.go | 104 ++++ .../repository/sqlserver/sqlserver_test.go | 31 ++ discovery/scanner.go | 161 ++++++ go.mod | 76 ++- go.sum | 269 ++++++++++- scan/scanner.go | 3 +- 77 files changed, 6257 insertions(+), 6 deletions(-) create mode 100644 Dockerfile create mode 100644 classification/classification.go create mode 100644 classification/classification_fakeclassifier_test.go create mode 100644 classification/examples/example_policy.rego create mode 100644 classification/examples/example_policy_simple.rego create mode 100644 
classification/label.go create mode 100644 classification/publisher/publisher.go create mode 100644 classification/publisher/stdout.go create mode 100644 classification/rego/README.md create mode 100644 classification/rego/address.rego create mode 100644 classification/rego/address_test.rego create mode 100644 classification/rego/age.rego create mode 100644 classification/rego/age_test.rego create mode 100644 classification/rego/ccn.rego create mode 100644 classification/rego/ccn_test.rego create mode 100644 classification/rego/cvv.rego create mode 100644 classification/rego/cvv_test.rego create mode 100644 classification/rego/dob.rego create mode 100644 classification/rego/dob_test.rego create mode 100644 classification/rego/email.rego create mode 100644 classification/rego/email_test.rego create mode 100644 classification/rego/first_name.rego create mode 100644 classification/rego/first_name_test.rego create mode 100644 classification/rego/full_name.rego create mode 100644 classification/rego/full_name_test.rego create mode 100644 classification/rego/ip_address.rego create mode 100644 classification/rego/ip_address_test.rego create mode 100644 classification/rego/labels.yaml create mode 100644 classification/rego/last_name.rego create mode 100644 classification/rego/last_name_test.rego create mode 100644 classification/rego/passport.rego create mode 100644 classification/rego/passport_test.rego create mode 100644 classification/rego/phone_number.rego create mode 100644 classification/rego/phone_number_test.rego create mode 100644 classification/rego/ssn.rego create mode 100644 classification/rego/ssn_test.rego create mode 100644 classification/rego_classifier.go create mode 100644 classification/rego_classifier_test.go create mode 100644 cmd/main.go create mode 100644 cmd/repo_scan.go create mode 100644 discovery/config/config.go create mode 100644 discovery/config/util.go create mode 100644 discovery/config/util_test.go create mode 100644 
discovery/repository/denodo/denodo.go create mode 100644 discovery/repository/doc.go create mode 100644 discovery/repository/gen.go create mode 100644 discovery/repository/genericsql/genericsql.go create mode 100644 discovery/repository/genericsql/genericsql_test.go create mode 100644 discovery/repository/genericsql/util.go create mode 100644 discovery/repository/metadata.go create mode 100644 discovery/repository/metadata_test.go create mode 100644 discovery/repository/mock_repository.go create mode 100644 discovery/repository/mysql/mysql.go create mode 100644 discovery/repository/mysql/mysql_test.go create mode 100644 discovery/repository/oracle/config.go create mode 100644 discovery/repository/oracle/oracle.go create mode 100644 discovery/repository/oracle/oracle_test.go create mode 100644 discovery/repository/postgresql/config.go create mode 100644 discovery/repository/postgresql/postgresql.go create mode 100644 discovery/repository/postgresql/postgresql_test.go create mode 100644 discovery/repository/redshift/redshift.go create mode 100644 discovery/repository/redshift/redshift_test.go create mode 100644 discovery/repository/repository.go create mode 100644 discovery/repository/repository_test.go create mode 100644 discovery/repository/sample_all_databases_test.go create mode 100644 discovery/repository/sample_repository_test.go create mode 100644 discovery/repository/sampling.go create mode 100644 discovery/repository/sampling_test.go create mode 100644 discovery/repository/snowflake/config.go create mode 100644 discovery/repository/snowflake/snowflake.go create mode 100644 discovery/repository/snowflake/snowflake_test.go create mode 100644 discovery/repository/sqlserver/sqlserver.go create mode 100644 discovery/repository/sqlserver/sqlserver_test.go create mode 100644 discovery/scanner.go diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..a35bd86 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM golang:1.22 as build + +# Set 
destination for COPY. +WORKDIR /app + +# Download dependencies. +COPY go.mod go.sum ./ +RUN go mod download + +# Copy the source code. +COPY . . + +# Build. +RUN CGO_ENABLED=0 go build -o dmap cmd/*.go + +FROM gcr.io/distroless/static-debian12:nonroot + +COPY --from=build /app/dmap /dmap + +ENTRYPOINT ["/dmap"] diff --git a/classification/classification.go b/classification/classification.go new file mode 100644 index 0000000..43682eb --- /dev/null +++ b/classification/classification.go @@ -0,0 +1,153 @@ +// Package classification provides various types and functions to facilitate +// data classification. The type Classifier provides an interface which takes +// sampled data as input and returns a classified version of that sample as +// output. The package contains at least one implementation which uses Rego and +// OPA to perform the actual classification logic (see LabelClassifier), however +// other implementations may be added in the future. +package classification + +import ( + "context" + "fmt" + + "github.com/cyralinc/dmap/discovery/repository" +) + +// ClassifiedTable identifies the table that a classification Result refers to. +type ClassifiedTable struct { + Repo string `json:"repo"` + Catalog string `json:"catalog"` + Schema string `json:"schema"` + Table string `json:"table"` +} + +// Result represents the classification of a data attribute. +type Result struct { + Table *ClassifiedTable `json:"table"` + AttributeName string `json:"attributeName"` + Classifications []*Label `json:"classifications"` +} + +// Classifier implementations know how to turn a row of data into a sequence of +// classification results. +type Classifier interface { + // Classify takes as input what amounts to a "row of data": complete + // information about where the table comes from as well as a list of columns + // and attributeValues. While the values might be any data type, by the time + // we reach here, we expect the values to be represented as strings. 
+ // + // For a given attribute, if it is classified as belonging to a particular + // classification group, we will add an instance for it in the Result. + // If however, there is no assigned classification, we will skip it in the + // results. A zero length return value is normal if none of the attributes + // matched the classification requirements. + Classify( + ctx context.Context, + table *ClassifiedTable, + attrs map[string]any, + ) ([]Result, error) +} + +// ClassifySamples uses the provided Classifier to classify the sample data +// passed via the "samples" parameter. It is mostly a helper function which +// loops through each repository.Sample, retrieves the attribute names and +// values of that sample, passes them to Classifier.Classify, and then +// aggregates the results. Please see the documentation for Classifier and its +// Classify method for more details. The returned slice represents all the +// unique classification results for a given sample set. +func ClassifySamples( + ctx context.Context, + classifier Classifier, + samples []repository.Sample, +) ([]Result, error) { + var classifications []Result + for _, sample := range samples { + table := ClassifiedTable{ + Repo: sample.Metadata.Repo, + Catalog: sample.Metadata.Database, + Schema: sample.Metadata.Schema, + Table: sample.Metadata.Table, + } + // Classify each sampled row + for _, sampleResult := range sample.Results { + res, err := classifier.Classify(ctx, &table, sampleResult) + if err != nil { + return nil, fmt.Errorf("error classifying sample: %w", err) + } + classifications = append(classifications, res...) + } + } + return combineAndDedupe(classifications), nil +} + +// AggregateClassifySamples classifies the given samples with every classifier, +// and returns the aggregate result slice. For details on how each +// classification is executed, see ClassifySamples. 
+func AggregateClassifySamples( + ctx context.Context, + classifiers map[string]Classifier, + samples []repository.Sample, +) ([]Result, error) { + classifications := make([]Result, 0) + for _, classifier := range classifiers { + classified, err := ClassifySamples(ctx, classifier, samples) + if err != nil { + return nil, err + } + classifications = append(classifications, classified...) + } + return classifications, nil +} + +// combineAndDedupe takes a slice of Result and combines the individual elements +// when they have the same schema/table/attribute, but different labels, into a +// Result element with combined labels. Additionally, only distinct results by +// schema, table, and attribute are present in the return slice. +func combineAndDedupe(results []Result) []Result { + set := make(map[tableAttrLabel]bool) + distinctLabels := make(map[tableAttr][]*Label) + for _, result := range results { + for _, lbl := range result.Classifications { + key := tableAttrLabel{ + ta: tableAttr{ + table: *result.Table, + attr: result.AttributeName, + }, + label: lbl.Name, + } + + if !set[key] { + set[key] = true + distinctLabels[key.ta] = append(distinctLabels[key.ta], lbl) + } + } + } + + distinctResults := make([]Result, 0, len(distinctLabels)) + for ta, labels := range distinctLabels { + result := Result{ + Table: &ClassifiedTable{ + Repo: ta.table.Repo, + Catalog: ta.table.Catalog, + Schema: ta.table.Schema, + Table: ta.table.Table, + }, + AttributeName: ta.attr, + Classifications: labels, + } + distinctResults = append(distinctResults, result) + } + + return distinctResults +} + +// Both tableAttr and tableAttrLabel are only used as map keys +type tableAttr struct { + table ClassifiedTable + attr string +} + +type tableAttrLabel struct { + ta tableAttr + label string +} diff --git a/classification/classification_fakeclassifier_test.go b/classification/classification_fakeclassifier_test.go new file mode 100644 index 0000000..5e41169 --- /dev/null +++ 
b/classification/classification_fakeclassifier_test.go @@ -0,0 +1,213 @@ +package classification + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/cyralinc/dmap/discovery/repository" +) + +type classifyFunc func(table *ClassifiedTable, attrs map[string]any) ([]Result, error) + +type fakeClassifier struct { + classify classifyFunc +} + +var _ Classifier = (*fakeClassifier)(nil) + +func (f fakeClassifier) Classify( + _ context.Context, + table *ClassifiedTable, + attrs map[string]any, +) ([]Result, error) { + return f.classify(table, attrs) +} + +func Test_Classify_FakeClassifier_SingleSample(t *testing.T) { + sample := repository.Sample{ + Metadata: repository.SampleMetadata{ + Repo: repoName, + Database: catalogName, + Schema: schemaName, + Table: tableName, + }, + Results: []repository.SampleResult{ + { + "age": "52", + "social_sec_num": "512-23-4256", + "credit_card_num": "4111111111111111", + }, + { + "age": "53", + "social_sec_num": "512-23-4258", + "credit_card_num": "4111111111111111", + }, + }, + } + + table := ClassifiedTable{ + Repo: repoName, + Catalog: catalogName, + Schema: schemaName, + Table: tableName, + } + + expected := []Result{ + { + Table: &table, + AttributeName: "age", + Classifications: []*Label{{Name: "PII"}}, + }, + { + Table: &table, + AttributeName: "social_sec_num", + Classifications: []*Label{{Name: "PII"}, {Name: "PRIVATE"}}, + }, + { + Table: &table, + AttributeName: "credit_card_num", + Classifications: []*Label{{Name: "PII"}, {Name: "CCN"}, {Name: "PCI"}}, + }, + } + + classifier := fakeClassifier{ + classify: func( + table *ClassifiedTable, + attrs map[string]any, + ) ([]Result, error) { + return expected, nil + }, + } + + actual, err := ClassifySamples(context.Background(), classifier, []repository.Sample{sample}) + assert.NoError(t, err) + assert.ElementsMatch(t, expected, actual) +} + +func Test_AggregateClassify_FakeClassifier_MultipleSamples(t *testing.T) { + tableName1 := tableName + 
"1" + tableName2 := tableName + "2" + ageAttr := "age" + ssnAttr := "social_sec_num" + ccnAttr := "credit_card_num" + samples := []repository.Sample{ + { + Metadata: repository.SampleMetadata{ + Repo: repoName, + Database: catalogName, + Schema: schemaName, + Table: tableName1, + }, + Results: []repository.SampleResult{ + { + ageAttr: "52", + ssnAttr: "512-23-4256", + ccnAttr: "4111111111111111", + }, + { + ageAttr: "53", + ssnAttr: "512-23-4258", + ccnAttr: "4111111111111111", + }, + }, + }, + { + Metadata: repository.SampleMetadata{ + Repo: repoName, + Database: catalogName, + Schema: schemaName, + Table: tableName2, + }, + Results: []repository.SampleResult{ + { + ageAttr: "21", + ssnAttr: "123-45-6789", + ccnAttr: "0123456789012345", + }, + { + ageAttr: "22", + ssnAttr: "987-65-4321", + ccnAttr: "0123456789012345", + }, + }, + }, + } + + table1 := ClassifiedTable{ + Repo: repoName, + Catalog: catalogName, + Schema: schemaName, + Table: tableName1, + } + table2 := ClassifiedTable{ + Repo: repoName, + Catalog: catalogName, + Schema: schemaName, + Table: tableName2, + } + + expectedFromClassifier1 := []Result{ + { + Table: &table1, + AttributeName: ageAttr, + Classifications: []*Label{{Name: "PII"}}, + }, + { + Table: &table1, + AttributeName: ssnAttr, + Classifications: []*Label{{Name: "PII"}, {Name: "PRIVATE"}}, + }, + { + Table: &table1, + AttributeName: ccnAttr, + Classifications: []*Label{{Name: "PII"}, {Name: "CCN"}, {Name: "PCI"}}, + }, + } + expectedFromClassifier2 := []Result{ + { + Table: &table2, + AttributeName: ageAttr, + Classifications: []*Label{{Name: "PII"}}, + }, + { + Table: &table2, + AttributeName: ssnAttr, + Classifications: []*Label{{Name: "PII"}, {Name: "PRIVATE"}}, + }, + { + Table: &table2, + AttributeName: ccnAttr, + Classifications: []*Label{{Name: "PII"}, {Name: "CCN"}, {Name: "PCI"}}, + }, + } + expected := append(expectedFromClassifier1, expectedFromClassifier2...) 
+ + classifiers := map[string]Classifier{ + "classifier1": Classifier( + fakeClassifier{ + classify: func( + table *ClassifiedTable, + attrs map[string]any, + ) ([]Result, error) { + return expectedFromClassifier1, nil + }, + }, + ), + "classifier2": Classifier( + fakeClassifier{ + classify: func( + table *ClassifiedTable, + attrs map[string]any, + ) ([]Result, error) { + return expectedFromClassifier2, nil + }, + }, + ), + } + + actual, err := AggregateClassifySamples(context.Background(), classifiers, samples) + assert.NoError(t, err) + assert.ElementsMatch(t, expected, actual) +} diff --git a/classification/examples/example_policy.rego b/classification/examples/example_policy.rego new file mode 100644 index 0000000..f872855 --- /dev/null +++ b/classification/examples/example_policy.rego @@ -0,0 +1,70 @@ +package classifier + +import rego.v1 + +output[k] := v if { + some k in object.keys(input) + v := classify(k, input[k]) +} + +default classify(_, _) := "UNLABELED" + +classify(_, val) := "EMAIL" if { + regex.match( + `\A[A-Za-z0-9][A-Za-z0-9._%+-]*@[A-Za-z0-9]((\.[A-Za-z0-9])|(-[A-Za-z0-9])|[A-Za-z0-9])*\.[A-Za-z]{2,}\z`, + val + ) +} + +classify(_, val) := "PHONE" if { + regex.match( + `\A((1(-| )?((\([2-9]\d{2}\))|([2-9]\d{2})))|([2-9]\d{2})|(\([2-9]\d{2}\)))(-| )?[2-9]((1[02-9])|([02-9]\d))(-| )?\d{4}\z`, + val + ) +} + +classify(_, val) := "SSN" if { + regex.match( + `\A((((66[0-57-9])|(6[0-57-7]\d)|(00[1-9])|(0[1-9]\d)|([1-578]\d{2}))-((0[1-9])|([1-9]\d))-((000[1-9])|(00[1-9]\d)|(0[1-9]\d{2})|([1-9]\d{3})))|(((66[0-57-9])|(6[0-57-7]\d)|(00[1-9])|(0[1-9]\d)|([1-578]\d{2}))+((0[1-9])|([1-9]\d))+((000[1-9])|(00[1-9]\d)|(0[1-9]\d{2})|([1-9]\d{3})))|(((66[0-57-9])|(6[0-57-7]\d)|(00[1-9])|(0[1-9]\d)|([1-578]\d{2}))\.((0[1-9])|([1-9]\d))\.((000[1-9])|(00[1-9]\d)|(0[1-9]\d{2})|([1-9]\d{3})))|(((66[0-57-9])|(6[0-57-7]\d)|(00[1-9])|(0[1-9]\d)|([1-578]\d{2}))((0[1-9])|([1-9]\d))((000[1-9])|(00[1-9]\d)|(0[1-9]\d{2})|([1-9]\d{3}))))\z`, + val + ) +} + +classify(_, val) 
:= "CCN" if { + regex.match( + `\A(4[0-9]{3}[0-9]{4}[0-9]{4}[0-9](?:[0-9]{3})?|[0-9]{4}[-][0-9]{4}[-][0-9]{4}[-][0-9]{4}|5[1-5][0-9]{2}[0-9]{4}[0-9]{4}[0-9]{4}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12}|(?:2131|1800|35\d{3})\d{11})\z`, + val + ) +} + +classify(_, val) := "IP_ADDRESS" if { # optional groups use {0,n}: RE2 reads {,n} as literal text + regex.match( + `\A(?:(?:[0-9A-Fa-f]{1,4}:){6}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|::(?:[0-9A-Fa-f]{1,4}:){5}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})?::)\z`, + val + ) +} + +classify(key, val) := "AGE" if { + lower(key) == "age" + regex.match(`\A((\d{1,2})|1[0-1]\d)\z`, val) +} + +classify(key, _) := "ADDRESS" 
if { + regex.match(`\A.*address.*\z`, lower(key)) +} + +classify(key, _) := "ADDRESS" if { + regex.match(`\Astreet.*\z`, lower(key)) +} + +classify(key, _) := "ADDRESS" if { + lower(key) == "state" +} + +classify(key, _) := "ADDRESS" if { + lower(key) == "zip" +} + +classify(key, _) := "ADDRESS" if { + lower(key) == "zipcode" +} diff --git a/classification/examples/example_policy_simple.rego b/classification/examples/example_policy_simple.rego new file mode 100644 index 0000000..9a2c312 --- /dev/null +++ b/classification/examples/example_policy_simple.rego @@ -0,0 +1,22 @@ +package classifier + +import rego.v1 + +output[k] := v if { + some k in object.keys(input) + v := classify(k, input[k]) +} + +default classify(_, _) := "UNLABELED" + +classify(_, val) := "CCN" if { + regex.match( + `\A(4[0-9]{3}[0-9]{4}[0-9]{4}[0-9](?:[0-9]{3})?|[0-9]{4}[-][0-9]{4}[-][0-9]{4}[-][0-9]{4}|5[1-5][0-9]{2}[0-9]{4}[0-9]{4}[0-9]{4}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12}|(?:2131|1800|35\d{3})\d{11})\z`, + val + ) +} + +classify(key, val) := "AGE" if { + contains(lower(key), "age") + regex.match(`\A((\d{1,2})|1[0-1]\d)\z`, val) +} diff --git a/classification/label.go b/classification/label.go new file mode 100644 index 0000000..42709fb --- /dev/null +++ b/classification/label.go @@ -0,0 +1,43 @@ +package classification + +import ( + "embed" + "fmt" + "strings" + + "gopkg.in/yaml.v3" +) + +// Label represents a data classification label. +type Label struct { + Name string `json:"name"` + Description string `json:"description"` + Tags []string `json:"tags"` + ClassificationRule string `json:"-"` +} + +//go:embed rego/*.rego +var regoFs embed.FS + +//go:embed rego/labels.yaml +var labelsYaml string + +// GetLabels returns a slice of all known labels defined in the labels.yaml +// file. 
+func GetLabels() ([]*Label, error) { + lbls := struct { + Labels []*Label `yaml:"labels"` + }{} + if err := yaml.Unmarshal([]byte(labelsYaml), &lbls); err != nil { + return nil, fmt.Errorf("error unmarshalling labels.yaml: %w", err) + } + for _, lbl := range lbls.Labels { + fname := "rego/" + strings.ReplaceAll(strings.ToLower(lbl.Name), " ", "_") + ".rego" + b, err := regoFs.ReadFile(fname) + if err != nil { + return nil, fmt.Errorf("error reading rego file %s: %w", fname, err) + } + lbl.ClassificationRule = string(b) + } + return lbls.Labels, nil +} diff --git a/classification/publisher/publisher.go b/classification/publisher/publisher.go new file mode 100644 index 0000000..97a33b7 --- /dev/null +++ b/classification/publisher/publisher.go @@ -0,0 +1,21 @@ +// Package publisher provides types for publishing data classification results +// to an external destination (Publisher), such as stdout (StdOutPublisher). +package publisher + +import ( + "context" + + "github.com/cyralinc/dmap/classification" +) + +// Publisher publishes classification and discovery results to some destination, +// which is left up to the implementer. +type Publisher interface { + // PublishClassifications publishes a list of classification.Result to + // some destination. Any error(s) during publication should be returned. + PublishClassifications( + ctx context.Context, + repoId string, + results []classification.Result, + ) error +} diff --git a/classification/publisher/stdout.go b/classification/publisher/stdout.go new file mode 100644 index 0000000..30b65b7 --- /dev/null +++ b/classification/publisher/stdout.go @@ -0,0 +1,39 @@ +package publisher + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/cyralinc/dmap/classification" +) + +// StdOutPublisher "publishes" classification results to stdout in JSON format. 
+type StdOutPublisher struct{} + +// StdOutPublisher implements Publisher +var _ Publisher = (*StdOutPublisher)(nil) + +func NewStdOutPublisher() *StdOutPublisher { + return &StdOutPublisher{} +} + +func (c *StdOutPublisher) PublishClassifications( + _ context.Context, + repoId string, + results []classification.Result, +) error { + classifications := struct { + Repo string `json:"repo"` + Classifications []classification.Result `json:"classifications"` + }{ + Repo: repoId, + Classifications: results, + } + b, err := json.MarshalIndent(classifications, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal classifications: %w", err) + } + fmt.Println(string(b)) + return nil +} diff --git a/classification/rego/README.md b/classification/rego/README.md new file mode 100644 index 0000000..c664cbe --- /dev/null +++ b/classification/rego/README.md @@ -0,0 +1,8 @@ +This directory contains all the data label definitions used for classification. +The label metadata is specified in the [`labels.yaml`](labels.yaml) file. Please +see that file's doc comment for more details. + +Additionally, the classification rule Rego code must be specified as a +`