From f89527945332e3260947f57270872c001e41d367 Mon Sep 17 00:00:00 2001 From: DC3-TSD <12175126+DC3-DCCI@users.noreply.github.com> Date: Wed, 29 Nov 2023 13:48:55 -0500 Subject: [PATCH] Fix yara recursion bug #40. Allow FileObject to accept bytearrays. --- CHANGELOG.md | 4 ++ docs/ParserComponents.md | 6 +-- docs/ParserDevelopment.md | 2 +- mwcp/file_object.py | 13 +++-- mwcp/metadata.py | 15 +++--- mwcp/report.py | 14 +++++ mwcp/runner.py | 54 ++++++++++++------- mwcp/tests/conftest.py | 8 +++ mwcp/tests/test_runner.py | 28 ++++++++++ mwcp/tests/test_runner/SiblingDispatch.py | 47 ++++++++++++++++ .../yara_repo/sibling_dispatch.yara | 42 +++++++++++++++ setup.cfg | 1 - 12 files changed, 196 insertions(+), 38 deletions(-) create mode 100644 mwcp/tests/test_runner/SiblingDispatch.py create mode 100644 mwcp/tests/test_runner/yara_repo/sibling_dispatch.yara diff --git a/CHANGELOG.md b/CHANGELOG.md index 171673f..cc8a8ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ All notable changes to this project will be documented in this file. ### Changed - Catch OSError from DateTime related constructs and raise as ConstructError +- `FileObject` objects now also accept bytearrays for the `file_data` field. + +### Fixed +- Fixed bug causing files to be unprocessed when using yara recursion with a dispatched file with a modified parent. (#40) ## [3.13.0] - 2023-07-17 diff --git a/docs/ParserComponents.md b/docs/ParserComponents.md index 8c71bb3..18476e3 100644 --- a/docs/ParserComponents.md +++ b/docs/ParserComponents.md @@ -113,10 +113,10 @@ def run(self): Extract metadata and implant from Foo Dropper. """ # Decrypt and report implant. - key = self._extract_rc4_key(self.file_object.file_data) + key = self._extract_rc4_key(self.file_object.data) if key: # Decrypt and dispatch implant. - implant_data = self._decrypt_implant(key, self.file_object.file_data) + implant_data = self._decrypt_implant(key, self.file_object.data) if implant_data: implant_file_object = FileObject(implant_data, description='Decrypted Implant') self.dispatcher.add(implant_file_object) @@ -139,7 +139,7 @@ need to be updated due to a new variant of the sample. Extract metadata and implant from Foo Dropper. """ # Decrypt and report implant. - key = self._extract_rc4_key(self.file_object.file_data) + key = self._extract_rc4_key(self.file_object.data) if key: # Report key. self.logger.info('Found the key!') diff --git a/docs/ParserDevelopment.md b/docs/ParserDevelopment.md index 418ec67..3f95ff4 100644 --- a/docs/ParserDevelopment.md +++ b/docs/ParserDevelopment.md @@ -355,7 +355,7 @@ class Dropper(Parser): Extract metadata and implant from Foo Dropper. """ # parse config - info = self.DECRYPT_CALL.parse(self.file_object.file_data, pe=self.file_object.pe) + info = self.DECRYPT_CALL.parse(self.file_object.data, pe=self.file_object.pe) config = info.config # report metadata diff --git a/mwcp/file_object.py b/mwcp/file_object.py index 6f0a4da..bf5dcad 100644 --- a/mwcp/file_object.py +++ b/mwcp/file_object.py @@ -56,7 +56,7 @@ class FileObject(object): def __init__( self, - file_data: bytes, + file_data: Union[bytes, bytearray], reporter=None, # DEPRECATED pe: pefile.PE = None, file_name=None, @@ -72,7 +72,7 @@ def __init__( """ Initializes the FileObject. - :param bytes file_data: Data for the file. + :param bytes/bytearray file_data: Data for the file. :param pefile.PE pe: PE object for the file. :param mwcp.Report reporter: MWCP Report. :param str file_name: File name to use if file is not a PE or use_supplied_fname was specified. @@ -94,9 +94,12 @@ def __init__( DeprecationWarning ) - # Ensure we are getting a bytes string. Libraries like pefile depend on this. - if not isinstance(file_data, bytes): - raise TypeError("file_data must be a bytes string.") + # Ensure we are getting a bytes string or bytearray. + # Convert bytearrays to bytes strings as libraries like pefile depend on this. + if isinstance(file_data, bytearray): + file_data = bytes(file_data) + elif not isinstance(file_data, bytes): + raise TypeError("file_data must be either a bytes string or bytearray.") self._file_path = file_path self._exists = bool(file_path) # Indicates if the user provided the path and the file exists on the host file system. diff --git a/mwcp/metadata.py b/mwcp/metadata.py index fb5221c..3d23c42 100644 --- a/mwcp/metadata.py +++ b/mwcp/metadata.py @@ -902,13 +902,12 @@ def as_formatted_dict(self, flat=False) -> dict: tags.extend(self.credentials.tags) actions = [] - if self.actions is not None: - for action in self.actions: - tags.extend(action.tags) - if action.cwd: - actions.append(f"{action.cwd}> {action.value}") - else: - actions.append(action.value) + for action in self.actions or []: + tags.extend(action.tags) + if action.cwd: + actions.append(f"{action.cwd}> {action.value}") + else: + actions.append(action.value) return { "tags": sorted(set(tags)), @@ -939,7 +938,7 @@ def as_stix(self, base_object, fixed_timestamp=None) -> STIXResult: result.add_linked(scheduled_task) result.create_tag_note(self, scheduled_task) - for action in self.actions: + for action in self.actions or []: action_obj = action.as_stix(base_object) result.merge(action_obj) result.add_unlinked(stix.Relationship( diff --git a/mwcp/report.py b/mwcp/report.py index c98cadb..d6a4533 100644 --- a/mwcp/report.py +++ b/mwcp/report.py @@ -222,6 +222,20 @@ def external_knowledge(self) -> dict: """Provides copy of the initial knowledge_base provided by the user.""" return dict(self._external_knowledge) # copy to prevent parser from modifying. + @property + def unidentified(self) -> List[FileObject]: + """The files that are unidentified.""" + from mwcp.dispatcher import UnidentifiedFile + if not self.input_file: + return [] + ret = [] + if self.input_file.parser == UnidentifiedFile: + ret.append(self.input_file) + for file_object in self.input_file.descendants: + if file_object.parser == UnidentifiedFile: + ret.append(file_object) + return ret + def get_logs(self, source: Optional[FileObject] = None, errors_only=False) -> List[str]: """ Gets log messages. diff --git a/mwcp/runner.py b/mwcp/runner.py index f5f9552..9a892ad 100644 --- a/mwcp/runner.py +++ b/mwcp/runner.py @@ -7,6 +7,7 @@ import pathlib import re import weakref +from collections import deque from typing import TYPE_CHECKING, Union, Type, Tuple, Iterable import yara @@ -17,7 +18,6 @@ from mwcp.dispatcher import Dispatcher from mwcp.report import Report from mwcp.registry import iter_parsers -from mwcp.dispatcher import UnidentifiedFile if TYPE_CHECKING: from mwcp import Parser @@ -148,7 +148,12 @@ def __init__( super().__init__(**report_config) self._rules = self.compile_rules(pathlib.Path(yara_repo)) self._recursive = recursive - self._attempted = set() # Keep track of files we have already attempted to parse. + self._queue = deque() + self._seen = set() + + def reset(self): + self._queue = deque() + self._seen = set() def compile_rules(self, yara_repo: pathlib.Path) -> yara.Rules: if not yara_repo.exists(): @@ -203,28 +208,33 @@ def iter_parsers(self, file_object: FileObject, parser: Union[str, Parser] = Non if not matched: logger.info(f"Found no YARA matches for {file_object.name}") + def _collect_unidentified(self, report: Report) -> Iterable[FileObject]: + """Collects new unidentified files since the last time this function was run.""" + for file_object in report.unidentified: + if file_object not in self._seen: + self._seen.add(file_object) + yield file_object + def _parse(self, input_file: FileObject, parsers: Iterable[Parser], report: Report): - self._attempted.add(input_file) super()._parse(input_file, parsers, report) - # After parsing the file, recursively process any undefined dispatched files. + + # After parsing the file, recursively add any new undefined dispatched files to the queue for processing. if self._recursive: - for child in input_file.descendants: - if child.parser == UnidentifiedFile and child not in self._attempted: - parsers = list(self.iter_parsers(child)) - if not parsers: - self._attempted.add(child) # Avoid running yara multiple times. - continue + for file_object in self._collect_unidentified(report): + parsers = list(self.iter_parsers(file_object)) + if not parsers: + continue - # Clear identification markings and try again. - child.parser = None - child.description = None + # Clear identification markings and try again. + file_object.parser = None + file_object.description = None - # Remove child from report. (It will get re-added when we parse.) - for file in report.get(metadata.File, source=child.parent): - if file.md5 == child.md5: - report.remove(file) + # Remove child from report. (It will get re-added when we parse.) + for file in report.get(metadata.File, source=file_object.parent): + if file.md5 == file_object.md5: + report.remove(file) - self._parse(child, parsers, report) + self._queue.appendleft((file_object, parsers)) def run( self, @@ -262,8 +272,12 @@ def run( with report, OutputLogger(): try: - parsers = self.iter_parsers(input_file, parser) - self._parse(input_file, parsers, report) + self.reset() + parsers = list(self.iter_parsers(input_file, parser)) + self._queue.appendleft((input_file, parsers)) + while self._queue: + file_object, parsers = self._queue.pop() + self._parse(file_object, parsers, report) return report finally: self._cleanup() diff --git a/mwcp/tests/conftest.py b/mwcp/tests/conftest.py index d31c1ab..6f8c7f4 100644 --- a/mwcp/tests/conftest.py +++ b/mwcp/tests/conftest.py @@ -285,3 +285,11 @@ def metadata_items() -> List[Metadata]: derivation="embedded" ), ] + + +def pytest_itemcollected(item): + """ + Automatically mark tests as "framework" if not marked as "parsers" + """ + if not any(marker.name == "parsers" for marker in item.iter_markers()): + item.add_marker("framework") diff --git a/mwcp/tests/test_runner.py b/mwcp/tests/test_runner.py index fc56be5..7695549 100644 --- a/mwcp/tests/test_runner.py +++ b/mwcp/tests/test_runner.py @@ -1,6 +1,7 @@ """ Tests mwcp.Runner components. """ +import textwrap import mwcp @@ -48,3 +49,30 @@ def test_yara_runner_recursive(datadir): assert report.input_file.description == "File A" residual_file = report.input_file.children[0] assert residual_file.description == "Unidentified file" + + +def test_yara_runner_sibling_dispatch(datadir): + """ + Tests Github issue #40 where a file doesn't get processed because + it was dispatched with a parent of an already processed sibling. + """ + mwcp.register_parser_directory(str(datadir), source_name="test") + + # Test running SingleDispatch parser and see if we successfully get the Grandchild to be parsed. + report = mwcp.run(data=b"matches parent", yara_repo=datadir / "yara_repo", recursive=True) + assert report + assert report.parser == "-" + input_file = report.input_file + assert input_file.description == "Parent" + children = input_file.children + assert len(children) == 2 + assert children[0].description == "Sibling 1" + assert children[1].description == "Sibling 2" + assert len(children[0].children) == 1 + # This was originally unidentified due to not being processed. + assert children[0].children[0].description == "Grandchild" + assert report.file_tree() == textwrap.dedent("""\ + <40b44905ee15a698e22f086c758a3981.bin (40b44905ee15a698e22f086c758a3981) : Parent> + ├── + │ └── <3ca5088d02dfb0fc668a0e2898ec3d93.bin (3ca5088d02dfb0fc668a0e2898ec3d93) : Grandchild> + └── """) diff --git a/mwcp/tests/test_runner/SiblingDispatch.py b/mwcp/tests/test_runner/SiblingDispatch.py new file mode 100644 index 0000000..1c41415 --- /dev/null +++ b/mwcp/tests/test_runner/SiblingDispatch.py @@ -0,0 +1,47 @@ +""" +Parsers for test_yara_runner_sibling_dispatch +""" + +from mwcp import Parser, FileObject + + +class Parent(Parser): + DESCRIPTION = "Parent" + + @classmethod + def identify(cls, file_object): + return b"parent" in file_object.data + + def run(self): + self.dispatcher.add(FileObject(b"sibling 1")) + self.dispatcher.add(FileObject(b"sibling 2")) + + +class Sibling1(Parser): + DESCRIPTION = "Sibling 1" + + @classmethod + def identify(cls, file_object): + return b"sibling 1" in file_object.data + + +class Sibling2(Parser): + DESCRIPTION = "Sibling 2" + + @classmethod + def identify(cls, file_object): + return b"sibling 2" in file_object.data + + def run(self): + # Testing corner case where we dispatch a file that is a parent of an already processed sibling. + sibling = self.file_object.siblings[0] + assert sibling.description == "Sibling 1" # sanity check + self.dispatcher.add(FileObject(b"grandchild"), parent=sibling) + + +class Grandchild(Parser): + DESCRIPTION = "Grandchild" + + @classmethod + def identify(cls, file_object): + return b"grandchild" in file_object.data diff --git a/mwcp/tests/test_runner/yara_repo/sibling_dispatch.yara b/mwcp/tests/test_runner/yara_repo/sibling_dispatch.yara new file mode 100644 index 0000000..1e87602 --- /dev/null +++ b/mwcp/tests/test_runner/yara_repo/sibling_dispatch.yara @@ -0,0 +1,42 @@ +/* +Rules for test_yara_runner_sibling_dispatch +*/ + +rule Parent { + meta: + mwcp = "SiblingDispatch.Parent" + strings: + $str = "parent" + condition: + all of them +} + + +rule Sibling1 { + meta: + mwcp = "SiblingDispatch.Sibling1" + strings: + $str = "sibling 1" + condition: + all of them +} + + +rule Sibling2 { + meta: + mwcp = "SiblingDispatch.Sibling2" + strings: + $str = "sibling 2" + condition: + all of them +} + + +rule Grandchild { + meta: + mwcp = "SiblingDispatch.Grandchild" + strings: + $str = "grandchild" + condition: + all of them +} diff --git a/setup.cfg b/setup.cfg index d43d6f4..2701380 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,6 @@ long-description = file:README.md [tool:pytest] testpaths = mwcp/tests required_plugins = pytest-datadir pytest-xdist -pyargs = mwcp filterwarnings = ignore::DeprecationWarning addopts =