T2-EngineScraper/scraper.py
2016-01-25 21:26:16 -05:00

482 lines
18 KiB
Python

"""
scraper.py
The scraper class is used to scrape data from a decompiled form of the
Tribes 2 game executable in order to build a tree of sorts that can be
used for mapping out the various functions, global variables and
datablock types & their associated properties.
This software is licensed under the MIT license. Refer to LICENSE.txt for
details.
Copyright (c) 2016 Robert MacGregor
"""
import re
import string
class EngineComponent(object):
    """
    Common base record for everything the scraper pulls out of the pseudo
    source code: a name, an address, a type name and a description.
    """

    # Class-level defaults; __init__ replaces all four on every instance.
    name = address = type_name = description = None

    def __init__(self, name, address, type_name, description):
        """
        Store the four identifying fields on the instance exactly as given.
        """
        self.name, self.address = name, address
        self.type_name, self.description = type_name, description
class Function(EngineComponent):
    """
    Record for a callable engine function exposed to Torque Script. On top
    of the base component fields it carries the minimum and maximum argument
    counts; type_name holds the bound object type name, if any.
    """

    # Class-level defaults; __init__ replaces both on every instance.
    min_args = max_args = None

    def __init__(self, name, address, type_name, description, min_args, max_args):
        """
        Initialise the base component fields, then record the argument
        count limits.
        """
        EngineComponent.__init__(
            self,
            name=name,
            address=address,
            type_name=type_name,
            description=description,
        )
        self.min_args, self.max_args = min_args, max_args
class GlobalVariable(EngineComponent):
    """
    Record for a global variable: a name, an address and a type name. The
    description field of the base component is always None.
    """

    def __init__(self, name, address, type_name):
        """
        Initialise the base component with no description.
        """
        EngineComponent.__init__(
            self,
            name=name,
            address=address,
            type_name=type_name,
            description=None,
        )
class Datablock(EngineComponent):
    """
    Record for a Torque Game Engine datablock, the construct used for
    synchronization of custom simulation parameters across the network.
    Only the name is populated; properties are collected by name.
    """

    # Class-level placeholder; every instance gets its own dict in __init__.
    properties = None

    class Property(EngineComponent):
        """
        A single named field of a datablock. Carries no description.
        """

        def __init__(self, name, address, type_name):
            """
            Initialise the base component with no description.
            """
            EngineComponent.__init__(
                self,
                name=name,
                address=address,
                type_name=type_name,
                description=None,
            )

    def __init__(self, name):
        """
        Create a datablock known only by name, with an empty property map.
        """
        EngineComponent.__init__(
            self,
            name=name,
            address=None,
            type_name=None,
            description=None,
        )
        self.properties = dict()
class Scraper(object):
    """
    The meat and potatoes of the scraper system. This is your primary
    class to instantiate.

    Constructing a Scraper reads the given pseudo source file and fills in
    the output attributes:

        global_functions -- list of Function objects
        type_methods     -- dict of type name to a list of tuples shaped
                            (typename, addr, name, desc, minArgs, maxArgs)
        global_values    -- list of GlobalVariable objects
        datablocks       -- dict of datablock type name to Datablock
    """

    # The global function registry is used by the scraper to determine
    # prefixes of all the sub routines that register global functions in the
    # Tribes 2 engine. They take the following format: sub_######
    _global_function_registry = ["426650", "426590", "4265D0", "426550", "426610"]

    # The type function registry is the same as the global function registry,
    # except used for type contextual functions. The registration signatures
    # for these functions are slightly different.
    _type_function_registry = ["426450", "426510", "426450", "425960"]

    # Registration subroutines regarding static fields of datablocks.
    _datablock_property_registry = ["423F20"]

    # Registration subroutines regarding globally addressable variables.
    _global_value_registry = ["4263B0"]

    # Base regular expression used for matching the various registration
    # calls. It is formatted with the above values in-place for operating
    # within the different contexts.
    _registration_expression_template = r'sub_(%s)[^;{]+;[^"]'

    # The datablock type table is used when looking up the context of a given
    # static datablock field registration call. This context is merely the
    # address of the calling subroutine, so multiple entries may have to be
    # added for a single datablock type.
    _datablock_type_table = {
        "61E7A0": "ExplosionData",
        "5B4F60": "WaterBlockData",
        "612400": "WheeledVehicleData",
        "6161E0": "HoverVehicleData",
        "5CE810": "PlayerData",
        "6034C0": "ItemData",
        "69C170": "TriggerData",
        "50DC70": "AudioProfileData",
        "62B3C0": "LinearProjectile",
        "60F820": "FlyingVehicleData",
        "6370D0": "SeekingProjectileData",
        "69B0F0": "PrecipitationData",
        "641480": "SniperProjectileData",
        "66A270": "SensorData",
        "6303F0": "GrenadeProjectileData",
        "6333D0": "GrenadeProjectileData",
        "694B40": "TracerProjectileData",
        "6470D0": "TargetProjectileData",
        "653E10": "TurretData",
        "654AE0": "TurretData",
        "5E4C20": "TurretData",  # Camera?
        "654330": "TurretImageData",  # TurretData?
        "64E2B0": "LightningData",
        "627150": "LightningData",
        "621DF0": "ParticleEmitterData",
        "622E60": "ParticleData",
        "644910": "ELFProjectileData",
        "64A860": "ELFProjectileData",
        "5F4D90": "ShapeBaseImageData",
        "602940": "StaticShapeData",
        "66B000": "SpawnSphere",
        "6099E0": "VehicleData",
        "47D880": "AI Task?",
        "63D870": "LinearFlareData",
        "59A870": "TerrainData",
        "68C4B0": "ShockwaveData",
        "4B5840": "CorpseData",
        "619B30": "Sky",
        "5AB310": "Sky",
        "68AAA0": "PhysicalZone",
        "626240": "Debris",
        "684000": "Debris",
        "6751A0": "ForceFieldBareData",
        "631A50": "ProjectileData",  # Base?
        "69AF10": "FireballAtmosphere",
    }

    # Hacks: matches a quote, optional spaces, a run of non-whitespace and
    # another quote. Used to find semicolons sitting inside string literals.
    string_expression = re.compile(r'" *\S+" *')

    # Compiled registration matchers, one per registry above.
    global_method_add_expression = re.compile(
        _registration_expression_template % "|".join(_global_function_registry),
        re.IGNORECASE)
    type_method_add_expression = re.compile(
        _registration_expression_template % "|".join(_type_function_registry),
        re.IGNORECASE)
    datablock_property_add_expression = re.compile(
        _registration_expression_template % "|".join(_datablock_property_registry),
        re.IGNORECASE)
    global_value_add_expression = re.compile(
        _registration_expression_template % "|".join(_global_value_registry),
        re.IGNORECASE)

    # Counters; reset on the instance by __init__.
    type_function_total = 0
    global_function_count = 0
    type_function_counts = None

    primitive_type_mapping = [
        "Unknown",
        "Integer",
        "Unknown",
        "Boolean",
        "Unknown",
        "Float",
        "Unknown"
    ]

    # Dictionary containing type name to parent type name mappings
    type_name_inheritance = {
        "HTTPObject": "TCPObject",
        "TCPObject": "SimObject",
        "FileObject": "SimObject",
        "Item": "ShapeBase",
        "ShapeBase": "GameBase",
        "GameBase": "SceneObject",
        "SceneObject": "NetObject",
        "NetObject": "SimObject",
        "Player": "ShapeBase",
        "DebugView": "GuiTextCtrl",
        "GuiTextCtrl": "GuiControl",
        "GuiControl": "SimGroup",
        "SimGroup": "SimSet",
        "SimSet": "SimObject",
        "Canvas": "GuiCanvas",
        "GuiCanvas": "GuiControl",
        "SimpleNetObject": "SimObject",
        "AIObjectiveQ": "SimSet",
        "ForceFieldBare": "GameBase",
        "AIConnection": "GameConnection",
        "GameConnection": "NetConnection",
        "NetConnection": "SimGroup",
        "Turret": "StaticShape",
        "StaticShape": "ShapeBase",
        "TerrainBlock": "SceneObject",
        "InteriorInstance": "SceneObject",
        "Trigger": "GameBase",
        "WaterBlock": "SceneObject",
        "FireballAtmosphere": "GameBase",
        "MissionArea": "NetObject",
        "TSStatic": "SceneObject",
        # Projectile Types
        "LinearProjectile": "Projectile",
        "Projectile": "GameBase",
        "EnergyProjectile": "GrenadeProjectile",
        "GrenadeProjectile": "Projectile",
        "TargetProjectile": "Projectile",
        # Vehicle Types
        "HoverVehicle": "Vehicle",
        "Vehicle": "ShapeBase",
        "FlyingVehicle": "Vehicle",
        "WheeledVehicle": "Vehicle",
        # Datablock Types
        "HoverVehicleData": "VehicleData",
        "VehicleData": "ShapeBaseData",
        "FlyingVehicleData": "VehicleData",
        "WheeledVehicleData": "VehicleData",
        "ForceFieldBareData": "GameBaseData",
        "LinearProjectileData": "ProjectileData",
        "EnergyProjectileData": "GrenadeProjectileData",
        "GrenadeProjectileData": "ProjectileData",
        "FireballAtmosphereData": "GameBaseData",
        "TargetProjectileData": "ProjectileData",
        "PlayerData": "ShapeBaseData",
        "ShapeBaseData": "GameBaseData",
        "GameBaseData": "SimDataBlock",
        "SimDataBlock": "SimObject",
    }

    # Outputs
    global_functions = None
    type_methods = None
    global_values = None
    datablocks = None

    def __init__(self, filename, skip_lines=33350):
        """
        Read the pseudo source at filename and scrape every registration
        call out of it.

        filename   -- path of the decompiled source file to read.
        skip_lines -- number of leading lines to drop before scanning. The
                      default skips the first 33350 or so lines because
                      there's lots of declarations up there that the
                      simplified regex will get tripped up on.

        A datablock field registration whose preceding "//----- " header
        does not yield a hexadecimal address raises ValueError; a global
        value or datablock field registration with too few arguments raises
        IndexError or ValueError.
        """
        file_buffer = self._read_source(filename, skip_lines)
        file_buffer = self._mask_string_semicolons(file_buffer)

        self.global_function_count = 0
        self.type_function_total = 0
        self.type_function_counts = {}

        self.global_functions = self._scrape_global_functions(file_buffer)
        self.type_methods = self._scrape_type_methods(file_buffer)
        self.global_values = self._scrape_global_values(file_buffer)
        self.datablocks = self._scrape_datablocks(file_buffer)

    def build_inheritance_tree(self, typename):
        """
        Return the list [typename, parent, grandparent, ...] obtained by
        following type_name_inheritance until a type with no entry is hit.
        """
        result = [typename]
        while typename in self.type_name_inheritance:
            typename = self.type_name_inheritance[typename]
            result.append(typename)
        return result

    # Scraping passes

    def _read_source(self, filename, skip_lines):
        """
        Return the file contents with the first skip_lines lines dropped and
        the remaining lines joined by single spaces.
        """
        with open(filename, "r") as handle:
            # splitlines copes with both LF and CRLF, regardless of whether
            # the platform already translated the line endings on read.
            lines = handle.read().splitlines()
        return " ".join(lines[skip_lines:])

    def _mask_string_semicolons(self, file_buffer):
        """
        Return file_buffer with every ";" that sits inside a match of
        string_expression replaced by "~".

        Methods that have a semicolon in their description (most do) would
        otherwise make the registration regex stop at that semicolon instead
        of the one that actually ends the call. The "~" is turned back into
        ";" by _extract_description. A list is used as a mutable buffer so
        each replacement is a single element write rather than a copy of the
        whole (large) string.
        """
        mutable_buffer = list(file_buffer)
        for string_occurrence in self.string_expression.finditer(file_buffer):
            base = string_occurrence.start()
            string_text = string_occurrence.group(0)
            semi_location = string_text.find(";")
            while semi_location != -1:
                mutable_buffer[base + semi_location] = "~"
                semi_location = string_text.find(";", semi_location + 1)
        return "".join(mutable_buffer)

    def _scrape_global_functions(self, file_buffer):
        """
        Return a list of Function objects, one per parsable global function
        registration call. Calls whose arguments cannot be parsed are
        skipped. Increments global_function_count per function found.
        """
        functions = []
        for registration in self.global_method_add_expression.finditer(file_buffer):
            call_source = self._extract_call_arguments(registration.group(0))
            # Extract the description first; this is a huge hack due to the
            # commas in the desc
            call_source, description = self._extract_description(call_source)
            arguments = call_source.split(",")
            try:
                name = self._extract_name(arguments, 0)
                address = self._extract_address(arguments, 1)
                min_args = int(arguments[3])
                max_args = int(arguments[4])
            except (ValueError, IndexError):
                # Not a registration we can read (non-numeric or missing
                # arguments); the regex is loose, so just move on.
                continue
            self.global_function_count += 1
            functions.append(Function(name, address, None, description, min_args, max_args))
        return functions

    def _scrape_type_methods(self, file_buffer):
        """
        Return a dict of type name to a list of tuples shaped
        (typename, addr, name, desc, minArgs, maxArgs), one per parsable
        type method registration call. Unparsable calls are skipped.
        Updates type_function_total and type_function_counts.
        """
        methods = {}
        for registration in self.type_method_add_expression.finditer(file_buffer):
            call_source = self._extract_call_arguments(registration.group(0))
            # Extract the description first; this is a huge hack due to the
            # commas in the desc
            call_source, description = self._extract_description(call_source)
            arguments = call_source.split(",")
            try:
                type_name = self._extract_name(arguments, 1)
                name = self._extract_name(arguments, 2)
                address = self._extract_address(arguments, 3)
                min_args = int(arguments[5])
                max_args = int(arguments[6])
            except (ValueError, IndexError):
                continue
            self.type_function_total += 1
            self.type_function_counts[type_name] = self.type_function_counts.get(type_name, 0) + 1
            methods.setdefault(type_name, []).append(
                (type_name, address, name, description, min_args, max_args))
        return methods

    def _scrape_global_values(self, file_buffer):
        """
        Return a list of GlobalVariable objects, one per global value
        registration call. The second call argument is parsed as an integer
        and stored as the variable's type_name.
        """
        values = []
        for registration in self.global_value_add_expression.finditer(file_buffer):
            arguments = self._extract_call_arguments(registration.group(0)).split(",")
            name = self._extract_name(arguments, 0)
            value_type = int(arguments[1])
            address = self._extract_address(arguments, 2)
            values.append(GlobalVariable(name, address, value_type))
        return values

    def _scrape_datablocks(self, file_buffer):
        """
        Return a dict of datablock type name to Datablock, with one Property
        per datablock field registration call found.
        """
        datablocks = {}
        for registration in self.datablock_property_add_expression.finditer(file_buffer):
            # Here we don't have to worry about anything with their own
            # scopes sitting above our declarations in the input file this
            # was built for, so we just search backwards for the nearest
            # "//----- " header and copy out the declaration source from it.
            declaration_start = file_buffer.rfind("//----- ", 0, registration.start())
            declaration_end = file_buffer.rfind("-", declaration_start, registration.start())
            calling_method = self._extract_caller(file_buffer[declaration_start:declaration_end])

            # If we don't know what it is, fall back to the caller address
            # itself. The shared class-level table is left untouched.
            datablock_type = self._datablock_type_table.get(calling_method, calling_method)
            if datablock_type not in datablocks:
                datablocks[datablock_type] = Datablock(datablock_type)

            # Pull the datablock property information now. The split runs on
            # the whole match, so the first piece still carries the sub_ call
            # prefix; _extract_name only keeps what follows the first quote.
            arguments = registration.group(0).split(",")
            property_name = self._extract_name(arguments, 0)
            property_address = self._extract_address(arguments, 2)
            datablocks[datablock_type].properties[property_name] = Datablock.Property(
                property_name, property_address, "Bla")
        return datablocks

    # Helper Functions

    def _extract_call_arguments(self, source):
        """
        Return the text between the first "(" and the last ")" of source.
        """
        return source[source.find("(") + 1:source.rfind(")")]

    def _extract_description(self, source):
        """
        Split the last quoted string out of an argument list.

        Returns (source, desc): source with the description text removed
        (its closing quote and everything after it are kept, so the argument
        positions are preserved when splitting on commas), and desc with any
        '(int)"' cast prefix removed and "~" turned back into ";".
        """
        desc_end = source.rfind("\"")
        # We found the end, now we need to look for the previous parameter
        # delineator by walking backwards.
        desc_begin = -1
        inside_quotes = True  # We start inside the description's quotation
        for index in range(desc_end - 1, -1, -1):
            character = source[index]
            if character == "\"":
                inside_quotes = not inside_quotes
            elif character == "," and not inside_quotes:
                desc_begin = index + 1
                break
        desc = source[desc_begin + 1:desc_end].lstrip()
        desc = desc.replace("(int)\"", "").replace("~", ";")
        source = source[:desc_begin] + source[desc_end:]
        return source, desc

    def _extract_name(self, source, index):
        """
        Return the quoted name held in source[index] with the surrounding
        quotes and spaces removed.
        """
        name = source[index].lstrip()
        name = name[name.find("\"") + 1:].rstrip("\" ")
        # Hack fix for the way the engine registers functions for the Sky type
        return name.replace("(int)&off_7957AC", "Sky")

    def _extract_address(self, source, index):
        """
        Return whatever follows the first underscore in source[index], with
        surrounding spaces and trailing quotes removed.
        """
        address = source[index]
        return address[address.find("_") + 1:].rstrip("\" ").lstrip()

    def _extract_caller(self, source):
        """
        Return the hexadecimal number found in the first parenthesised group
        of source as upper case hex digits with no leading zeroes.

        Raises ValueError when the group does not hold a hexadecimal number.
        """
        start = source.find("(")
        end = source.find(")", start)
        # "%X" avoids the trailing "L" that hex() adds to Python 2 longs.
        return "%X" % int(source[start + 1:end], 16)