Skip to content

Commit

Permalink
Fix yara recursion bug #40. Allow FileObject to accept bytearrays.
Browse files Browse the repository at this point in the history
  • Loading branch information
dc3-tsd committed Dec 6, 2023
1 parent aa27546 commit f895279
Show file tree
Hide file tree
Showing 12 changed files with 196 additions and 38 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ All notable changes to this project will be documented in this file.

### Changed
- Catch OSError from DateTime related constructs and raise as ConstructError
- `FileObject` objects now also accept bytearrays for the `file_data` field.

### Fixed
- Fixed bug causing files to be left unprocessed when using yara recursion and a file is dispatched under a parent other than the file currently being parsed. (#40)


## [3.13.0] - 2023-07-17
Expand Down
6 changes: 3 additions & 3 deletions docs/ParserComponents.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,10 @@ def run(self):
Extract metadata and implant from Foo Dropper.
"""
# Decrypt and report implant.
key = self._extract_rc4_key(self.file_object.file_data)
key = self._extract_rc4_key(self.file_object.data)
if key:
# Decrypt and dispatch implant.
implant_data = self._decrypt_implant(key, self.file_object.file_data)
implant_data = self._decrypt_implant(key, self.file_object.data)
if implant_data:
implant_file_object = FileObject(implant_data, description='Decrypted Implant')
self.dispatcher.add(implant_file_object)
Expand All @@ -139,7 +139,7 @@ need to be updated due to a new variant of the sample.
Extract metadata and implant from Foo Dropper.
"""
# Decrypt and report implant.
key = self._extract_rc4_key(self.file_object.file_data)
key = self._extract_rc4_key(self.file_object.data)
if key:
# Report key.
self.logger.info('Found the key!')
Expand Down
2 changes: 1 addition & 1 deletion docs/ParserDevelopment.md
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ class Dropper(Parser):
Extract metadata and implant from Foo Dropper.
"""
# parse config
info = self.DECRYPT_CALL.parse(self.file_object.file_data, pe=self.file_object.pe)
info = self.DECRYPT_CALL.parse(self.file_object.data, pe=self.file_object.pe)
config = info.config

# report metadata
Expand Down
13 changes: 8 additions & 5 deletions mwcp/file_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class FileObject(object):

def __init__(
self,
file_data: bytes,
file_data: Union[bytes, bytearray],
reporter=None, # DEPRECATED
pe: pefile.PE = None,
file_name=None,
Expand All @@ -72,7 +72,7 @@ def __init__(
"""
Initializes the FileObject.
:param bytes file_data: Data for the file.
:param bytes/bytearray file_data: Data for the file.
:param pefile.PE pe: PE object for the file.
:param mwcp.Report reporter: MWCP Report.
:param str file_name: File name to use if file is not a PE or use_supplied_fname was specified.
Expand All @@ -94,9 +94,12 @@ def __init__(
DeprecationWarning
)

# Ensure we are getting a bytes string. Libraries like pefile depend on this.
if not isinstance(file_data, bytes):
raise TypeError("file_data must be a bytes string.")
# Ensure we are getting a bytes string or bytearray.
# Convert bytearrays to bytes strings as libraries like pefile depend on this.
if isinstance(file_data, bytearray):
file_data = bytes(file_data)
elif not isinstance(file_data, bytes):
raise TypeError("file_data must be either a bytes string or bytearray.")

self._file_path = file_path
self._exists = bool(file_path) # Indicates if the user provided the path and the file exists on the host file system.
Expand Down
15 changes: 7 additions & 8 deletions mwcp/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,13 +902,12 @@ def as_formatted_dict(self, flat=False) -> dict:
tags.extend(self.credentials.tags)

actions = []
if self.actions is not None:
for action in self.actions:
tags.extend(action.tags)
if action.cwd:
actions.append(f"{action.cwd}> {action.value}")
else:
actions.append(action.value)
for action in self.actions or []:
tags.extend(action.tags)
if action.cwd:
actions.append(f"{action.cwd}> {action.value}")
else:
actions.append(action.value)

return {
"tags": sorted(set(tags)),
Expand Down Expand Up @@ -939,7 +938,7 @@ def as_stix(self, base_object, fixed_timestamp=None) -> STIXResult:
result.add_linked(scheduled_task)
result.create_tag_note(self, scheduled_task)

for action in self.actions:
for action in self.actions or []:
action_obj = action.as_stix(base_object)
result.merge(action_obj)
result.add_unlinked(stix.Relationship(
Expand Down
14 changes: 14 additions & 0 deletions mwcp/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,20 @@ def external_knowledge(self) -> dict:
"""Provides copy of the initial knowledge_base provided by the user."""
return dict(self._external_knowledge) # copy to prevent parser from modifying.

@property
def unidentified(self) -> List[FileObject]:
    """
    Lists every file in this report's file tree that is still unidentified.

    The input file is considered first, followed by each of its descendants.
    An empty list is returned when the report has no input file.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import with mwcp.dispatcher -- confirm).
    from mwcp.dispatcher import UnidentifiedFile

    root = self.input_file
    if not root:
        return []
    return [
        candidate
        for candidate in (root, *root.descendants)
        if candidate.parser == UnidentifiedFile
    ]

def get_logs(self, source: Optional[FileObject] = None, errors_only=False) -> List[str]:
"""
Gets log messages.
Expand Down
54 changes: 34 additions & 20 deletions mwcp/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pathlib
import re
import weakref
from collections import deque
from typing import TYPE_CHECKING, Union, Type, Tuple, Iterable

import yara
Expand All @@ -17,7 +18,6 @@
from mwcp.dispatcher import Dispatcher
from mwcp.report import Report
from mwcp.registry import iter_parsers
from mwcp.dispatcher import UnidentifiedFile

if TYPE_CHECKING:
from mwcp import Parser
Expand Down Expand Up @@ -148,7 +148,12 @@ def __init__(
super().__init__(**report_config)
self._rules = self.compile_rules(pathlib.Path(yara_repo))
self._recursive = recursive
self._attempted = set() # Keep track of files we have already attempted to parse.
self._queue = deque()
self._seen = set()

def reset(self):
    """Drops any pending work by installing a fresh, empty queue and a fresh, empty seen-set."""
    self._queue, self._seen = deque(), set()

def compile_rules(self, yara_repo: pathlib.Path) -> yara.Rules:
if not yara_repo.exists():
Expand Down Expand Up @@ -203,28 +208,33 @@ def iter_parsers(self, file_object: FileObject, parser: Union[str, Parser] = Non
if not matched:
logger.info(f"Found no YARA matches for {file_object.name}")

def _collect_unidentified(self, report: Report) -> Iterable[FileObject]:
"""Collects new unidentified files since the last time this function was run."""
for file_object in report.unidentified:
if file_object not in self._seen:
self._seen.add(file_object)
yield file_object

def _parse(self, input_file: FileObject, parsers: Iterable[Parser], report: Report):
# NOTE(review): this is a diff hunk shown without +/- markers, so statements of the previous
# recursive implementation (self._attempted, child) appear next to their queue-based
# replacements (self._collect_unidentified, file_object, self._queue).
self._attempted.add(input_file)
super()._parse(input_file, parsers, report)
# After parsing the file, recursively process any undefined dispatched files.

# After parsing the file, recursively add any new unidentified dispatched files to the queue for processing.
if self._recursive:
for child in input_file.descendants:
if child.parser == UnidentifiedFile and child not in self._attempted:
parsers = list(self.iter_parsers(child))
if not parsers:
self._attempted.add(child) # Avoid running yara multiple times.
continue
for file_object in self._collect_unidentified(report):
parsers = list(self.iter_parsers(file_object))
if not parsers:
continue

# Clear identification markings and try again.
child.parser = None
child.description = None
# Clear identification markings and try again.
file_object.parser = None
file_object.description = None

# Remove child from report. (It will get re-added when we parse.)
for file in report.get(metadata.File, source=child.parent):
if file.md5 == child.md5:
report.remove(file)
# Remove child from report. (It will get re-added when we parse.)
for file in report.get(metadata.File, source=file_object.parent):
if file.md5 == file_object.md5:
report.remove(file)

self._parse(child, parsers, report)
self._queue.appendleft((file_object, parsers))

def run(
self,
Expand Down Expand Up @@ -262,8 +272,12 @@ def run(

with report, OutputLogger():
try:
parsers = self.iter_parsers(input_file, parser)
self._parse(input_file, parsers, report)
self.reset()
parsers = list(self.iter_parsers(input_file, parser))
self._queue.appendleft((input_file, parsers))
while self._queue:
file_object, parsers = self._queue.pop()
self._parse(file_object, parsers, report)
return report
finally:
self._cleanup()
8 changes: 8 additions & 0 deletions mwcp/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,3 +285,11 @@ def metadata_items() -> List[Metadata]:
derivation="embedded"
),
]


def pytest_itemcollected(item):
    """
    Tags a collected test with the "framework" marker unless it already carries the "parsers" marker.
    """
    marker_names = {marker.name for marker in item.iter_markers()}
    if "parsers" not in marker_names:
        item.add_marker("framework")
28 changes: 28 additions & 0 deletions mwcp/tests/test_runner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Tests mwcp.Runner components.
"""
import textwrap

import mwcp

Expand Down Expand Up @@ -48,3 +49,30 @@ def test_yara_runner_recursive(datadir):
assert report.input_file.description == "File A"
residual_file = report.input_file.children[0]
assert residual_file.description == "Unidentified file"


def test_yara_runner_sibling_dispatch(datadir):
"""
Tests Github issue #40 where a file doesn't get processed because
it was dispatched with an already processed sibling as its parent.
"""
mwcp.register_parser_directory(str(datadir), source_name="test")

# Test running SiblingDispatch parser and see if we successfully get the Grandchild to be parsed.
report = mwcp.run(data=b"matches parent", yara_repo=datadir / "yara_repo", recursive=True)
assert report
assert report.parser == "-"
input_file = report.input_file
assert input_file.description == "Parent"
children = input_file.children
assert len(children) == 2
assert children[0].description == "Sibling 1"
assert children[1].description == "Sibling 2"
assert len(children[0].children) == 1
# This was originally unidentified due to not being processed.
assert children[0].children[0].description == "Grandchild"
assert report.file_tree() == textwrap.dedent("""\
<40b44905ee15a698e22f086c758a3981.bin (40b44905ee15a698e22f086c758a3981) : Parent>
├── <efd40a513a2b00d7354756967ff6b683.bin (efd40a513a2b00d7354756967ff6b683) : Sibling 1>
│ └── <3ca5088d02dfb0fc668a0e2898ec3d93.bin (3ca5088d02dfb0fc668a0e2898ec3d93) : Grandchild>
└── <aaaa145ac48779f3eafdb0e521d15b94.bin (aaaa145ac48779f3eafdb0e521d15b94) : Sibling 2>""")
47 changes: 47 additions & 0 deletions mwcp/tests/test_runner/SiblingDispatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Parsers for test_yara_runner_sibling_dispatch
"""

from mwcp import Parser, FileObject


class Parent(Parser):
    """Test parser for the top-level file; it dispatches the two sibling files."""

    DESCRIPTION = "Parent"

    @classmethod
    def identify(cls, file_object):
        """Claims any file whose data contains ``b"parent"``."""
        marker = b"parent"
        return marker in file_object.data

    def run(self):
        """Dispatches ``b"sibling 1"`` and then ``b"sibling 2"`` as new files."""
        for payload in (b"sibling 1", b"sibling 2"):
            self.dispatcher.add(FileObject(payload))


class Sibling1(Parser):
    """Test parser for the first dispatched sibling; it defines no ``run()`` of its own."""

    DESCRIPTION = "Sibling 1"

    @classmethod
    def identify(cls, file_object):
        """Claims any file whose data contains ``b"sibling 1"``."""
        marker = b"sibling 1"
        return marker in file_object.data


class Sibling2(Parser):
    """Test parser for the second sibling; it dispatches a file underneath the first sibling."""

    DESCRIPTION = "Sibling 2"

    @classmethod
    def identify(cls, file_object):
        """Claims any file whose data contains ``b"sibling 2"``."""
        marker = b"sibling 2"
        return marker in file_object.data

    def run(self):
        """Dispatches ``b"grandchild"`` with the first sibling as its parent."""
        # Corner case under test: the dispatched file's parent is an already processed sibling.
        first_sibling = self.file_object.siblings[0]
        assert first_sibling.description == "Sibling 1"  # sanity check
        grandchild = FileObject(b"grandchild")
        self.dispatcher.add(grandchild, parent=first_sibling)


class Grandchild(Parser):
    """Test parser for the file dispatched by Sibling2; it defines no ``run()`` of its own."""

    DESCRIPTION = "Grandchild"

    @classmethod
    def identify(cls, file_object):
        """Claims any file whose data contains ``b"grandchild"``."""
        marker = b"grandchild"
        return marker in file_object.data
42 changes: 42 additions & 0 deletions mwcp/tests/test_runner/yara_repo/sibling_dispatch.yara
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
Rules for test_yara_runner_sibling_dispatch
*/

// Matches data containing the string "parent".
// The mwcp meta value names the Parent class in SiblingDispatch.py
// (presumably how the YARA runner selects the parser -- confirm).
rule Parent {
meta:
mwcp = "SiblingDispatch.Parent"
strings:
$str = "parent"
condition:
all of them
}


// Matches data containing the string "sibling 1".
// The mwcp meta value names the Sibling1 class in SiblingDispatch.py
// (presumably how the YARA runner selects the parser -- confirm).
rule Sibling1 {
meta:
mwcp = "SiblingDispatch.Sibling1"
strings:
$str = "sibling 1"
condition:
all of them
}


// Matches data containing the string "sibling 2".
// The mwcp meta value names the Sibling2 class in SiblingDispatch.py
// (presumably how the YARA runner selects the parser -- confirm).
rule Sibling2 {
meta:
mwcp = "SiblingDispatch.Sibling2"
strings:
$str = "sibling 2"
condition:
all of them
}


// Matches data containing the string "grandchild".
// The mwcp meta value names the Grandchild class in SiblingDispatch.py
// (presumably how the YARA runner selects the parser -- confirm).
rule Grandchild {
meta:
mwcp = "SiblingDispatch.Grandchild"
strings:
$str = "grandchild"
condition:
all of them
}
1 change: 0 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ long-description = file:README.md
[tool:pytest]
testpaths = mwcp/tests
required_plugins = pytest-datadir pytest-xdist
pyargs = mwcp
filterwarnings =
ignore::DeprecationWarning
addopts =
Expand Down

1 comment on commit f895279

@Playwright1
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Majikx0

Please sign in to comment.