Generic File Parser to extract features from Malicious Files.
A single library parser to extract meta information, perform static analysis, and detect macros within files.
Source Code : Generic File Parser
- Clone the Repo
- Create a virtualenv
virtualenv pyenv
- Install the requirements.
pip install -r requirements.txt
Script Usage
(pyenv) admin@cuckoo:~/generic-parser$ python -h
usage: [-h] -f PATH [-s STORE] -y YARA -e EXTRACT [--version]
optional arguments:
-h, --help show this help message and exit
-f PATH, --path PATH File Absolute Path
-s STORE, --store STORE
Store to DB
-y YARA, --yara YARA Apply Yara Matcher
-e EXTRACT, --extract EXTRACT
Extract Features
--version show program's version number and exit
- PATH : This should point to the path of the malware file which you want to analyze.
- STORE : Enable this flag if you want to store in a database.
- YARA : Enable this flag to apply yara to match for suspicious indicators in the file.
- EXTRACT : Enable this flag to extract features from the file.
- version : Shows the version of the tool.
- Identifies the decomposition module to use based on the file's MIME type.
- Apply PDF based decomposition to extract features from the pdf file.
- Apply Office based decomposition to extract features of office files.
- Web-based files are decomposed to extract interesting strings and similar indicators.
- Yara is applied on the entire file to get interesting matches which can help in identifying suspicious behaviour.
Sample Use Case for PDF File:
python -f test_files/0007b52a37aef3c0cbfb96348b826fb42a48ea895fa4446ce76683fb5195f759 -y 1 -e 1
"access_time": 1530781000,
"device": 2049,
"entropy": 0.024414521113765863,
"features": {
"pdf_features": {
"comment": 1,
"comments": [
"indirectObjects": [],
"indirect_obj": 0,
"names": [],
"startXref": [],
"start_xref": 0,
"trailer": [],
"xref": 0,
"xreg": []
"file_name": "0007b52a37aef3c0cbfb96348b826fb42a48ea895fa4446ce76683fb5195f759",
"file_path": "test_files/0007b52a37aef3c0cbfb96348b826fb42a48ea895fa4446ce76683fb5195f759",
"file_size_not_multiple_8": 7,
"group_id_of_owner": 1000,
"inode_number": 5379227,
"macro": 1,
"magic_buffer": "PDF document, version 1.4",
"magic_info": "PDF document, version 1.4",
"md5": "57fb493d35f33901845bbe4612faae6c",
"meta_data_change_time": 1505571184,
"mime": "application/pdf",
"min_possible_file_size": 733.5831159053229,
"modification_time": 315426600,
"no_of_hard_links": 1,
"protection_bytes": 33256,
"sha1": "57fb493d35f33901845bbe4612faae6c",
"sha256": "e1de36178b189e54ecb88497745a9b49b7e4db1e",
"size": 30047,
"user_id_of_owner": 1000,
"yara": [
Sample Use case for PE32 File
python -f test_files/07041a3c64fea7dd888220c87ce090aa6d29c92d75ea9fce1b1d3ec98ff64cd8 -y 1 -e 1
"access_time": 1530781850,
"device": 2049,
"entropy": 0.12669530516464111,
"features": {
"pe_features": {
"anti_debugging_capabilities": [],
"anti_vm_capabilities": [],
"check_sum": 0,
"compile_date": 1398238638,
"datadir_IMAGE_DIRECTORY_ENTRY_IAT_size": 196,
"debug_size": 0,
"export_size": 0,
"generated_check_sum": 984057,
"iat_rva": 28868,
"import_bound_symbols": [],
"import_symbols": [
"imported_symbols": [
"major_version": 4,
"minor_version": 0,
"number_of_bound_import_symbols": -1,
"number_of_bound_imports": -1,
"number_of_export_symbols": -1,
"number_of_import_symbols": 45,
"number_of_imports": 4,
"number_of_rva_and_sizes": 16,
"number_of_sections": 4,
"pe_char": 15,
"pe_dll": 0,
"pe_driver": 0,
"pe_exe": 1,
"pe_i386": 1,
"pe_majorlink": 18,
"pe_minorlink": 8,
"pe_warning_strings": [
"Invalid relocation information. SizeOfBlock too large: 3431661568",
"Corrupt header \"IMAGE_LOAD_CONFIG_DIRECTORY\" at file offset 68096. Exception: 'Data length less than expected header length.'"
"pe_warnings": 1,
"sec_entropy_code": 1.0860475014720217,
"sec_entropy_data": 6.024395015352624,
"sec_entropy_r1": 0.0,
"sec_entropy_rdata": -1,
"sec_entropy_reloc": -1,
"sec_entropy_rsrc": 4.653008448519358,
"sec_entropy_text": -1,
"sec_raw_execsize": 124400,
"sec_rawptr_code": 1024,
"sec_rawptr_data": 6656,
"sec_rawptr_r1": 30720,
"sec_rawptr_rsrc": 16384,
"sec_rawptr_text": -1,
"sec_rawsize_code": 5632,
"sec_rawsize_data": 9728,
"sec_rawsize_r1": 4608,
"sec_rawsize_rsrc": 14336,
"sec_rawsize_text": -1,
"sec_va_execsize": 34304,
"sec_vasize_code": 20992,
"sec_vasize_data": 82432,
"sec_vasize_r1": 4592,
"sec_vasize_rsrc": 16384,
"sec_vasize_text": -1,
"size_code": 20480,
"size_image": 135664,
"size_initdata": 86016,
"size_uninit": 438272,
"std_section_names": 0,
"total_size_pe": 936352,
"virtual_address": 4096,
"virtual_size": 20992,
"virtual_size_2": 82432
"pe_rare_features": {
"imported_symbols": -1,
"pe_warning_strings": -1,
"section_names": [
"file_name": "07041a3c64fea7dd888220c87ce090aa6d29c92d75ea9fce1b1d3ec98ff64cd8",
"file_path": "test_files/07041a3c64fea7dd888220c87ce090aa6d29c92d75ea9fce1b1d3ec98ff64cd8",
"file_size_not_multiple_8": 0,
"group_id_of_owner": 1000,
"inode_number": 5379220,
"macro": 1,
"magic_buffer": "PE32 executable (GUI) Intel 80386, for MS Windows",
"magic_info": "PE32 executable (GUI) Intel 80386, for MS Windows",
"md5": "c7aca54886e13e3bc79a1ec4c94e7518",
"meta_data_change_time": 1505571184,
"mime": "application/x-dosexec",
"min_possible_file_size": 118631.40238152204,
"modification_time": 315426600,
"no_of_hard_links": 1,
"protection_bytes": 33256,
"sha1": "c7aca54886e13e3bc79a1ec4c94e7518",
"sha256": "96d2e1d4a451636a5997fa1e6e8f18969134004f",
"size": 936352,
"user_id_of_owner": 1000,
"yara": [
Leave a Comment