Skip to content

Commit

Permalink
Initial discovery and classification CLI implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
ccampo133 committed Mar 27, 2024
1 parent 16f6769 commit 977618a
Show file tree
Hide file tree
Showing 77 changed files with 6,257 additions and 6 deletions.
20 changes: 20 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Build stage: compile the dmap binary using the official Go 1.22 image.
# The stage alias keyword is spelled AS so that its casing matches FROM.
FROM golang:1.22 AS build

# Set destination for COPY.
WORKDIR /app

# Download dependencies. Only go.mod and go.sum are copied at this point so the
# download step does not depend on the rest of the source tree.
COPY go.mod go.sum ./
RUN go mod download

# Copy the source code.
COPY . .

# Build with cgo disabled; the runtime stage below uses the "static" distroless
# base image.
RUN CGO_ENABLED=0 go build -o dmap cmd/*.go

# Runtime stage: distroless static base image, "nonroot" tag.
FROM gcr.io/distroless/static-debian12:nonroot

# Only the compiled binary is carried over from the build stage.
COPY --from=build /app/dmap /dmap

ENTRYPOINT ["/dmap"]
153 changes: 153 additions & 0 deletions classification/classification.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Package classification provides various types and functions to facilitate
// data classification. The type Classifier provides an interface which takes
// sampled data as input and returns a classified version of that sample as
// output. The package contains at least one implementation which uses Rego and
// OPA to perform the actual classification logic (see LabelClassifier), however
// other implementations may be added in the future.
package classification

import (
"context"
"fmt"

"github.com/cyralinc/dmap/discovery/repository"
)

// ClassifiedTable identifies the table that a classification Result refers to.
// ClassifySamples builds one from the metadata of each repository.Sample it
// processes. All fields are serialized to JSON under lower-case keys.
type ClassifiedTable struct {
// Repo is the repository name (taken from the sample metadata's Repo).
Repo string `json:"repo"`
// Catalog is the catalog name (taken from the sample metadata's Database).
Catalog string `json:"catalog"`
// Schema is the schema name (taken from the sample metadata's Schema).
Schema string `json:"schema"`
// Table is the table name (taken from the sample metadata's Table).
Table string `json:"table"`
}

// Result represents the classification of a data attribute: which table the
// attribute belongs to, the attribute's name, and the labels assigned to it.
type Result struct {
// Table identifies the table that contains the classified attribute.
Table *ClassifiedTable `json:"table"`
// AttributeName is the name of the classified attribute.
AttributeName string `json:"attributeName"`
// Classifications holds the labels assigned to the attribute. Label is
// declared elsewhere in this package.
Classifications []*Label `json:"classifications"`
}

// Classifier implementations know how to turn a row of data into a sequence of
// classification results.
type Classifier interface {
// Classify takes as input what amounts to a "row of data": complete
// information about where the table comes from (table) as well as the
// row's attributes (attrs), passed as a map. While the values might be
// any data type, by the time we reach here, we expect the values to be
// represented as strings.
//
// For a given attribute, if it is classified as belonging to a particular
// classification group, we will add an instance for it in the Result.
// If however, there is no assigned classification, we will skip it in the
// results. A zero length return value is normal if none of the attributes
// matched the classification requirements.
//
// A non-nil error indicates that classification of the row failed.
Classify(
ctx context.Context,
table *ClassifiedTable,
attrs map[string]any,
) ([]Result, error)
}

// ClassifySamples runs the given Classifier over every sampled row of every
// repository.Sample in samples and returns the combined outcome.
//
// For each sample, a ClassifiedTable is derived from the sample's metadata and
// handed to Classifier.Classify together with each row of the sample. The
// per-row results are accumulated and finally passed through combineAndDedupe,
// so the returned slice holds the unique classification results for the whole
// sample set. See Classifier and its Classify method for more details.
//
// The first error returned by the classifier aborts the run; it is wrapped and
// returned alongside a nil slice.
func ClassifySamples(
	ctx context.Context,
	classifier Classifier,
	samples []repository.Sample,
) ([]Result, error) {
	var collected []Result
	for i := range samples {
		meta := samples[i].Metadata
		// One table descriptor per sample; it is shared by all of its rows.
		tbl := &ClassifiedTable{
			Repo:    meta.Repo,
			Catalog: meta.Database,
			Schema:  meta.Schema,
			Table:   meta.Table,
		}
		for _, row := range samples[i].Results {
			rowResults, err := classifier.Classify(ctx, tbl, row)
			if err != nil {
				return nil, fmt.Errorf("error classifying sample: %w", err)
			}
			collected = append(collected, rowResults...)
		}
	}
	return combineAndDedupe(collected), nil
}

// AggregateClassifySamples runs ClassifySamples once per classifier in the
// classifiers map and concatenates everything they return into one slice.
// See ClassifySamples for how each individual classification is performed.
//
// The classifiers are visited by ranging over the map, so the relative order
// of results coming from different classifiers is not fixed. The returned
// slice is never nil on success. The first classifier error is returned as-is
// together with a nil slice.
func AggregateClassifySamples(
	ctx context.Context,
	classifiers map[string]Classifier,
	samples []repository.Sample,
) ([]Result, error) {
	aggregated := []Result{}
	for name := range classifiers {
		perClassifier, err := ClassifySamples(ctx, classifiers[name], samples)
		if err != nil {
			return nil, err
		}
		aggregated = append(aggregated, perClassifier...)
	}
	return aggregated, nil
}

// combineAndDedupe takes a slice of Result and combines the individual elements
// when they have the same repo/catalog/schema/table/attribute, but different
// labels, into a single Result element with combined labels. Only distinct
// results by table and attribute are present in the returned slice, and a
// label name appears at most once per table/attribute.
//
// The output is deterministic: results are emitted in the order in which each
// table/attribute pair was first seen in the input, and the labels of each
// result keep their first-seen order as well.
//
// Results without any labels contribute nothing to the output. Results whose
// Table is nil and nil labels are skipped, because they cannot be attributed
// to a table or carry no classification. The returned slice is never nil.
func combineAndDedupe(results []Result) []Result {
	// seen tracks every (table, attribute, label name) triple already recorded.
	seen := make(map[tableAttrLabel]struct{})
	// labelsByAttr accumulates the distinct labels per (table, attribute).
	labelsByAttr := make(map[tableAttr][]*Label)
	// order remembers the (table, attribute) keys in first-seen order, since
	// ranging over a map does not yield a stable order.
	var order []tableAttr

	for _, result := range results {
		if result.Table == nil {
			continue
		}
		ta := tableAttr{table: *result.Table, attr: result.AttributeName}
		for _, lbl := range result.Classifications {
			if lbl == nil {
				continue
			}
			key := tableAttrLabel{ta: ta, label: lbl.Name}
			if _, dup := seen[key]; dup {
				continue
			}
			seen[key] = struct{}{}
			if _, known := labelsByAttr[ta]; !known {
				order = append(order, ta)
			}
			labelsByAttr[ta] = append(labelsByAttr[ta], lbl)
		}
	}

	distinctResults := make([]Result, 0, len(order))
	for _, ta := range order {
		// Each result gets its own copy of the table so that callers cannot
		// affect sibling results through a shared pointer.
		table := ta.table
		distinctResults = append(distinctResults, Result{
			Table:           &table,
			AttributeName:   ta.attr,
			Classifications: labelsByAttr[ta],
		})
	}

	return distinctResults
}

// Both tableAttr and tableAttrLabel are only used as map keys
//
// tableAttr identifies a single attribute of a single table. The table is
// stored by value (not by pointer) so that two keys compare equal whenever
// their field values are equal.
type tableAttr struct {
table ClassifiedTable
attr string
}

// tableAttrLabel extends tableAttr with a label name. combineAndDedupe uses it
// as a set key to detect labels already recorded for a table attribute.
type tableAttrLabel struct {
ta tableAttr
label string
}
213 changes: 213 additions & 0 deletions classification/classification_fakeclassifier_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
package classification

import (
"context"
"testing"

"github.com/stretchr/testify/assert"

"github.com/cyralinc/dmap/discovery/repository"
)

// classifyFunc is the signature of the stub function that backs a
// fakeClassifier. It mirrors Classifier.Classify without the context argument.
type classifyFunc func(table *ClassifiedTable, attrs map[string]any) ([]Result, error)

// fakeClassifier is a test double for Classifier whose behavior is supplied by
// the test through the classify function.
type fakeClassifier struct {
// classify is invoked by Classify to produce the results and error.
classify classifyFunc
}

// Compile-time check that fakeClassifier satisfies the Classifier interface.
var _ Classifier = (*fakeClassifier)(nil)

// Classify satisfies Classifier by handing table and attrs to the configured
// classify function and returning whatever it produces. The context is unused.
func (f fakeClassifier) Classify(
	_ context.Context,
	table *ClassifiedTable,
	attrs map[string]any,
) ([]Result, error) {
	results, err := f.classify(table, attrs)
	return results, err
}

func Test_Classify_FakeClassifier_SingleSample(t *testing.T) {
sample := repository.Sample{
Metadata: repository.SampleMetadata{
Repo: repoName,
Database: catalogName,
Schema: schemaName,
Table: tableName,
},
Results: []repository.SampleResult{
{
"age": "52",
"social_sec_num": "512-23-4256",
"credit_card_num": "4111111111111111",
},
{
"age": "53",
"social_sec_num": "512-23-4258",
"credit_card_num": "4111111111111111",
},
},
}

table := ClassifiedTable{
Repo: repoName,
Catalog: catalogName,
Schema: schemaName,
Table: tableName,
}

expected := []Result{
{
Table: &table,
AttributeName: "age",
Classifications: []*Label{{Name: "PII"}},
},
{
Table: &table,
AttributeName: "social_sec_num",
Classifications: []*Label{{Name: "PII"}, {Name: "PRIVATE"}},
},
{
Table: &table,
AttributeName: "credit_card_num",
Classifications: []*Label{{Name: "PII"}, {Name: "CCN"}, {Name: "PCI"}},
},
}

classifier := fakeClassifier{
classify: func(
table *ClassifiedTable,
attrs map[string]any,
) ([]Result, error) {
return expected, nil
},
}

actual, err := ClassifySamples(context.Background(), classifier, []repository.Sample{sample})
assert.NoError(t, err)
assert.ElementsMatch(t, expected, actual)
}

// Test_AggregateClassify_FakeClassifier_MultipleSamples runs
// AggregateClassifySamples over two samples (one per table) with two fake
// classifiers. Each fake always returns the fixed results for "its" table, so
// the aggregate is expected to contain the results of both, in any order.
func Test_AggregateClassify_FakeClassifier_MultipleSamples(t *testing.T) {
	const (
		ageAttr = "age"
		ssnAttr = "social_sec_num"
		ccnAttr = "credit_card_num"
	)
	firstTable, secondTable := tableName+"1", tableName+"2"

	// metadataFor builds the sample metadata for the named table.
	metadataFor := func(tbl string) repository.SampleMetadata {
		return repository.SampleMetadata{
			Repo:     repoName,
			Database: catalogName,
			Schema:   schemaName,
			Table:    tbl,
		}
	}
	samples := []repository.Sample{
		{
			Metadata: metadataFor(firstTable),
			Results: []repository.SampleResult{
				{
					ageAttr: "52",
					ssnAttr: "512-23-4256",
					ccnAttr: "4111111111111111",
				},
				{
					ageAttr: "53",
					ssnAttr: "512-23-4258",
					ccnAttr: "4111111111111111",
				},
			},
		},
		{
			Metadata: metadataFor(secondTable),
			Results: []repository.SampleResult{
				{
					ageAttr: "21",
					ssnAttr: "123-45-6789",
					ccnAttr: "0123456789012345",
				},
				{
					ageAttr: "22",
					ssnAttr: "987-65-4321",
					ccnAttr: "0123456789012345",
				},
			},
		},
	}

	// resultsFor builds the fixed classifier output for the named table.
	resultsFor := func(tbl string) []Result {
		classified := &ClassifiedTable{
			Repo:    repoName,
			Catalog: catalogName,
			Schema:  schemaName,
			Table:   tbl,
		}
		return []Result{
			{
				Table:           classified,
				AttributeName:   ageAttr,
				Classifications: []*Label{{Name: "PII"}},
			},
			{
				Table:           classified,
				AttributeName:   ssnAttr,
				Classifications: []*Label{{Name: "PII"}, {Name: "PRIVATE"}},
			},
			{
				Table:           classified,
				AttributeName:   ccnAttr,
				Classifications: []*Label{{Name: "PII"}, {Name: "CCN"}, {Name: "PCI"}},
			},
		}
	}
	wantFirst := resultsFor(firstTable)
	wantSecond := resultsFor(secondTable)

	want := make([]Result, 0, len(wantFirst)+len(wantSecond))
	want = append(want, wantFirst...)
	want = append(want, wantSecond...)

	// stubReturning builds a fake classifier that always yields fixed.
	stubReturning := func(fixed []Result) Classifier {
		return fakeClassifier{
			classify: func(*ClassifiedTable, map[string]any) ([]Result, error) {
				return fixed, nil
			},
		}
	}
	classifiers := map[string]Classifier{
		"classifier1": stubReturning(wantFirst),
		"classifier2": stubReturning(wantSecond),
	}

	got, err := AggregateClassifySamples(context.Background(), classifiers, samples)
	assert.NoError(t, err)
	assert.ElementsMatch(t, want, got)
}
Loading

0 comments on commit 977618a

Please sign in to comment.