Skip to content

Commit

Permalink
ENG-13633: Initial discovery and classification implementation (#51)
Browse files Browse the repository at this point in the history
Initial implementation of Dmap's discovery and classification feature.

The main entrypoint is the "dmap" CLI (see the cmd directory).

Note that some of the details like command name, parameters, etc. are subject to change until the first stable version is released.

Additionally, most of the code that powers the CLI has been added as public packages to the main module, enhancing the API of the existing Dmap library. Users can use these packages to implement their own discovery and classification tooling if desired. There are two new top-level packages added to the public API:
  * classification - provides an API to perform data classification on arbitrary string data.
  * sql - provides an API to introspect, sample, and scan (which is introspect + sample + classify) SQL data repositories.

A new RepoScanner interface was also added to the scan package.
  • Loading branch information
ccampo133 authored Apr 8, 2024
1 parent 101f1c2 commit f1d5fc0
Show file tree
Hide file tree
Showing 74 changed files with 5,824 additions and 65 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: Build CLI Docker Image

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Build Docker image
run: docker build . --file Dockerfile
32 changes: 32 additions & 0 deletions .github/workflows/rego.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: Rego Lint and Test

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]

jobs:
lint-and-test:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v4

- name: Setup Regal
uses: StyraInc/setup-regal@v1
with:
version: latest

- name: Lint
# Disable the line-length check for now, as it's too strict: it flags
# the long regexes in various classification policies.
run: regal lint --format=github --disable=line-length ./classification/labels/

- name: Setup OPA
uses: open-policy-agent/setup-opa@v2
with:
version: latest

- name: Run OPA Tests
run: opa test ./classification/labels/*.rego -v
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,6 @@ __debug_bin

# Backup files
*~

# Other
/.run/
20 changes: 20 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Build stage: compiles the dmap CLI using the golang:1.22 image.
FROM golang:1.22 as build

# Set destination for COPY.
WORKDIR /app

# Download dependencies. Only go.mod and go.sum are copied first so that this
# layer can be reused from the build cache when just the source code changes.
COPY go.mod go.sum ./
RUN go mod download

# Copy the source code.
COPY . .

# Build the CLI from the sources in cmd/ with cgo disabled. The main.version
# variable is stamped with the output of `git describe --tags --always`.
# NOTE(review): this presumably requires the .git directory to be part of the
# build context copied above - confirm it is not excluded by a .dockerignore.
RUN CGO_ENABLED=0 go build -ldflags="-X main.version=$(git describe --tags --always)" -o dmap cmd/*.go

# Final stage: a distroless "nonroot" image that receives only the binary.
FROM gcr.io/distroless/static-debian12:nonroot

# Copy the compiled binary out of the build stage.
COPY --from=build /app/dmap /dmap

ENTRYPOINT ["/dmap"]
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,12 @@ integration-test:

clean:
go clean -i ./...

# Format the Rego classification policies in place.
opa-fmt:
	opa fmt --write ./classification/labels

# Deprecated alias: the original (misspelled) target name, kept so that
# existing invocations of `make opt-fmt` keep working.
opt-fmt: opa-fmt

opa-lint:
regal lint --disable=line-length ./classification/labels/

opa-test:
opa test ./classification/labels/*.rego -v
16 changes: 8 additions & 8 deletions aws/scanner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,19 +159,19 @@ func (s *AWSScannerTestSuite) TestScan() {
awsClientConstructor: func(awsConfig aws.Config) *awsClient {
return &awsClient{
config: awsConfig,
rds: &mock.MockRDSClient{
rds: &mock.RDSClient{
DBClusters: s.dummyRDSClusters,
DBInstances: s.dummyRDSInstances,
},
redshift: &mock.MockRedshiftClient{
redshift: &mock.RedshiftClient{
Clusters: s.dummyRedshiftClusters,
},
dynamodb: &mock.MockDynamoDBClient{
dynamodb: &mock.DynamoDBClient{
TableNames: s.dummyDynamoDBTableNames,
Table: s.dummyDynamoDBTable,
Tags: s.dummyDynamoDBTags,
},
s3: &mock.MockS3Client{
s3: &mock.S3Client{
Buckets: s.dummyS3Buckets,
Tags: s.dummyS3Tags,
},
Expand Down Expand Up @@ -330,23 +330,23 @@ func (s *AWSScannerTestSuite) TestScan_WithErrors() {
awsClientConstructor: func(awsConfig aws.Config) *awsClient {
return &awsClient{
config: awsConfig,
rds: &mock.MockRDSClient{
rds: &mock.RDSClient{
Errors: map[string]error{
"DescribeDBClusters": dummyError,
"DescribeDBInstances": dummyError,
},
},
redshift: &mock.MockRedshiftClient{
redshift: &mock.RedshiftClient{
Errors: map[string]error{
"DescribeClusters": dummyError,
},
},
dynamodb: &mock.MockDynamoDBClient{
dynamodb: &mock.DynamoDBClient{
Errors: map[string]error{
"ListTables": dummyError,
},
},
s3: &mock.MockS3Client{
s3: &mock.S3Client{
Errors: map[string]error{
"ListBuckets": dummyError,
"GetBucketTagging": dummyError,
Expand Down
51 changes: 51 additions & 0 deletions classification/classification.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Package classification provides various types and functions to facilitate
// data classification. The type Classifier provides an interface which takes
// arbitrary data as input and returns a classified version of that data as
// output. The package contains at least one implementation which uses OPA and
// Rego to perform the actual classification logic (see LabelClassifier);
// however, other implementations may be added in the future.
package classification

import (
	"context"
	"encoding/json"
	"sort"
)

// Classifier is the interface implemented by data classifiers. A classifier
// takes a set of data attributes and classifies them into a set of labels.
type Classifier interface {
// Classify takes the given input, which amounts to essentially a "row of
// data", and returns the data classifications for that input. The input is
// a map of attribute names (i.e. columns) to their values. The returned
// Result maps each attribute name to the set of labels that the attribute
// was classified as.
Classify(ctx context.Context, input map[string]any) (Result, error)
}

// Result represents the classifications for a set of data attributes. Each key
// is an attribute (i.e. column) name and each value is the set of labels that
// the attribute was classified as. It is the type returned by
// Classifier.Classify.
type Result map[string]LabelSet

// LabelSet is a set of unique label names.
type LabelSet map[string]struct{}

// MarshalJSON marshals the LabelSet into a JSON array of strings, where each
// string is the name of a label in the set. The names are emitted in ascending
// lexical order so that the encoded form is deterministic (Go map iteration
// order is randomized). A nil or empty set marshals to an empty array ("[]"),
// never "null".
func (l LabelSet) MarshalJSON() ([]byte, error) {
	keys := make([]string, 0, len(l))
	for k := range l {
		keys = append(keys, k)
	}
	// Sort so that repeated calls on the same set produce identical bytes.
	sort.Strings(keys)
	return json.Marshal(keys)
}

// Classification represents the classification of a single data repository
// attribute: where the attribute lives and which labels were assigned to it.
type Classification struct {
// AttributePath is the full path of the data repository attribute
// (e.g. the column). Each element corresponds to a component, in increasing
// order of granularity (e.g. [database, schema, table, column]).
AttributePath []string `json:"attributePath"`
// Labels is the set of labels that the attribute was classified as. It is
// encoded to JSON by LabelSet.MarshalJSON, i.e. as an array of label names.
Labels LabelSet `json:"labels"`
}
199 changes: 199 additions & 0 deletions classification/label.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
package classification

import (
"embed"
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"

"github.com/open-policy-agent/opa/ast"
log "github.com/sirupsen/logrus"
"gopkg.in/yaml.v3"
)

var (
// predefinedLabelsFs is the embedded file system holding the predefined
// label definitions: the labels/labels.yaml file and the Rego rule files
// in the labels directory.
//
//go:embed labels/*.rego labels/labels.yaml
predefinedLabelsFs embed.FS
)

// InvalidLabelsError is an error type that represents an error when one or
// more labels are invalid, e.g. they have invalid classification rules. It
// aggregates the individual errors that caused the problems; these can be
// retrieved with Unwrap.
type InvalidLabelsError struct {
	errs []error
}

// Unwrap returns the errors that caused the InvalidLabelsError.
func (e InvalidLabelsError) Unwrap() []error {
	return e.errs
}

// Error returns a string representation of the InvalidLabelsError: the
// messages of the wrapped errors joined together. If there are no non-nil
// wrapped errors, a generic message is returned instead.
func (e InvalidLabelsError) Error() string {
	// errors.Join returns nil when it is given no non-nil errors, so calling
	// Error on its result unconditionally would panic for an empty value.
	joined := errors.Join(e.errs...)
	if joined == nil {
		return "invalid labels"
	}
	return joined.Error()
}

// Label represents a data classification label.
type Label struct {
// Name is the name of the label. When labels are loaded from a labels YAML
// file by getLabels, it is set from the label's key in that file.
Name string `yaml:"name" json:"name"`
// Description is a brief description of the label.
Description string `yaml:"description" json:"description"`
// Tags are a list of arbitrary tags associated with the label.
Tags []string `yaml:"tags" json:"tags"`
// ClassificationRule is the parsed Rego module holding the classification
// rule used to classify data. It is excluded from both YAML and JSON
// encoding (tag "-").
ClassificationRule *ast.Module `yaml:"-" json:"-"`
}

// NewLabel builds a Label from its name, description, raw Rego classification
// rule source, and optional tags. The Rego source is parsed up front; if it
// cannot be parsed, the zero Label is returned together with an error that
// wraps the parse failure.
func NewLabel(name, description, classificationRule string, tags ...string) (Label, error) {
	module, parseErr := parseRego(classificationRule)
	if parseErr != nil {
		return Label{}, fmt.Errorf("error preparing classification rule for label %s: %w", name, parseErr)
	}
	lbl := Label{
		Name:               name,
		Description:        description,
		Tags:               tags,
		ClassificationRule: module,
	}
	return lbl, nil
}

// GetPredefinedLabels loads and returns the predefined labels that are embedded
// in the package, along with their classification rules. The labels come from
// the embedded labels.yaml file and the rules from the embedded Rego files.
//
// An error reading or unmarshalling the labels file is returned as is. Errors
// reading or parsing individual classification rules are aggregated into an
// InvalidLabelsError, which is returned together with the labels that were
// read successfully. In practice no error is expected, because the embedded
// labels should always be valid; an error here indicates a problem with the
// embedded labels themselves.
func GetPredefinedLabels() ([]Label, error) {
	const embeddedLabelsYaml = "labels/labels.yaml"
	labels, err := getLabels(embeddedLabelsYaml, true)
	return labels, err
}

// GetCustomLabels loads and returns the labels defined in the given labels
// YAML file, along with their classification rules, which are read from the
// Rego files referenced in that YAML. The file name is first resolved to an
// absolute path.
//
// An error resolving the path, or reading or unmarshalling the labels file, is
// returned as is. Errors reading or parsing individual classification rules
// are aggregated into an InvalidLabelsError, which is returned together with
// the labels that were read successfully.
func GetCustomLabels(labelsYamlFname string) ([]Label, error) {
	absPath, absErr := filepath.Abs(labelsYamlFname)
	if absErr != nil {
		return nil, fmt.Errorf(
			"error getting absolute path for labels yaml file %s: %w",
			labelsYamlFname,
			absErr,
		)
	}
	const fromEmbeddedFs = false
	return getLabels(absPath, fromEmbeddedFs)
}

// getLabels reads the labels YAML file at the given path and returns the labels
// it defines together with their parsed classification rules.
//
// If predefined is true, path is a slash-separated name inside the embedded
// FS; otherwise it is a path on the local file system. Each label's rule file
// is resolved as follows:
//   - predefined labels: relative to the directory of the labels YAML file,
//     inside the embedded FS;
//   - custom labels with an absolute rule path: that path as is;
//   - custom labels with a relative rule path: relative to the directory of
//     the labels YAML file (not the current working directory).
//
// An error reading or unmarshalling the labels file is returned (wrapping the
// underlying cause) with no labels. Errors reading or parsing individual
// classification rules are aggregated into an InvalidLabelsError, which is
// returned along with the labels that were read successfully. The order of the
// returned labels is unspecified.
func getLabels(path string, predefined bool) ([]Label, error) {
	var (
		labelsFs    fs.ReadFileFS
		labelsFname string
	)
	if predefined {
		labelsFs = predefinedLabelsFs
		labelsFname = path
	} else {
		labelsFs = os.DirFS(filepath.Dir(path)).(fs.ReadFileFS)
		labelsFname = filepath.Base(path)
	}
	// Read and parse the labels yaml file.
	yamlBytes, err := labelsFs.ReadFile(labelsFname)
	if err != nil {
		// Wrap the cause so callers can tell e.g. "not found" from
		// "permission denied".
		return nil, fmt.Errorf("error reading label yaml file %s: %w", path, err)
	}
	// yamlLabel is the on-disk shape of a label: the Label fields plus the
	// location of its Rego rule file.
	type yamlLabel struct {
		Label `yaml:",inline"`
		Rule  string `yaml:"rule"`
	}
	yamlLabels := make(map[string]yamlLabel)
	if err := yaml.Unmarshal(yamlBytes, &yamlLabels); err != nil {
		return nil, fmt.Errorf("error unmarshalling labels yaml: %w", err)
	}
	labels := make([]Label, 0, len(yamlLabels))
	// Read each label's classification rule.
	var errs []error
	for name, lbl := range yamlLabels {
		// The rule file for this label is either an absolute path, a relative
		// path, or a predefined rule. We need to determine the fs to use to
		// read the rule file.
		var (
			ruleFs    fs.ReadFileFS
			ruleFname string
		)
		if predefined {
			// We're dealing with the predefined labels, therefore the rule FS
			// is the same embedded FS as the labels YAML file. The rule file
			// name must be rooted at the directory of the labels YAML file,
			// because names are resolved from the root of the embedded FS.
			// io/fs names are always slash-separated, so normalize the joined
			// name: filepath.Join uses the OS separator, which would produce
			// an invalid name on Windows.
			ruleFname = filepath.ToSlash(filepath.Join(filepath.Dir(path), lbl.Rule))
			ruleFs = labelsFs
		} else {
			ruleFname = filepath.Base(lbl.Rule)
			if filepath.IsAbs(lbl.Rule) {
				// The rule has an absolute path, so we need to create a new fs
				// for the rule's directory.
				ruleFs = os.DirFS(filepath.Dir(lbl.Rule)).(fs.ReadFileFS)
			} else {
				// The rule has a relative path, which is relative to the labels
				// YAML file (as opposed to the current directory).
				ruleFs = os.DirFS(
					filepath.Join(
						filepath.Dir(path),
						filepath.Dir(lbl.Rule),
					),
				).(fs.ReadFileFS)
			}
		}
		rule, err := readLabelRule(ruleFname, ruleFs)
		if err != nil {
			// Keep going so that every invalid label is reported at once.
			errs = append(errs, fmt.Errorf("error reading classification rule for label %s: %w", name, err))
			continue
		}
		// The label's name is its key in the YAML document.
		lbl.Name = name
		lbl.ClassificationRule = rule
		labels = append(labels, lbl.Label)
	}
	if len(errs) > 0 {
		return labels, InvalidLabelsError{errs}
	}
	return labels, nil
}

// readLabelRule loads the Rego file named fname from labelFs and parses it into
// a module. Read and parse failures are returned wrapped, with the file name
// included in the message.
func readLabelRule(fname string, labelFs fs.ReadFileFS) (*ast.Module, error) {
	src, readErr := labelFs.ReadFile(fname)
	if readErr != nil {
		return nil, fmt.Errorf("error reading rego file %s: %w", fname, readErr)
	}

	module, parseErr := parseRego(string(src))
	if parseErr != nil {
		return nil, fmt.Errorf("error parsing classification rule for file %s: %w", fname, parseErr)
	}

	return module, nil
}

// parseRego parses raw Rego source code into a module named "classifier". The
// source is logged at trace level before it is parsed. A parse failure is
// returned wrapped, with a nil module.
func parseRego(code string) (*ast.Module, error) {
	log.Tracef("classifier module code: '%s'", code)

	parsed, parseErr := ast.ParseModule("classifier", code)
	if parseErr != nil {
		return nil, fmt.Errorf("error parsing rego code: %w", parseErr)
	}

	return parsed, nil
}
Loading

0 comments on commit f1d5fc0

Please sign in to comment.