Performance fixes; fixed duplicate datablock entries; multi-processing support

Robert MacGregor 2015-07-28 23:48:33 -04:00
parent 5cccabb0a3
commit 3b8fd518fe
3 changed files with 436 additions and 215 deletions

95
main.py Normal file

@@ -0,0 +1,95 @@
"""
main.py
"""
import re
import os
import sys
import multiprocessing
import importlib
import os.path
import timeit
import cProfile
import tsscraper
class Application(object):
process_count = 8
target_directories = None
target_exporter = None
def print_usage(self):
print("Usage: '%s <exporter> <target directories...>'" % sys.argv[0])
print("Or: '%s exporters' for a list of known exporters." % sys.argv[0])
def get_available_exporters(self):
exporters = { }
for root, dirs, files in os.walk("exporters"):
for filename in files:
module_name, extension = os.path.splitext(filename)
if (module_name == "__init__"):
continue
try:
module = importlib.import_module('exporters.%s' % (module_name))
exporters[module_name] = module
except ImportError as e:
print(e)
return exporters
def main(self):
"""
The main entry point of the application, equivalent to
the main() function in C and C++.
"""
if (len(sys.argv) < 2):
self.print_usage()
return
exporters = self.get_available_exporters()
if (sys.argv[1] == "exporters"):
print("Available Exporters: ")
for exporter in exporters.keys():
print("\t- %s" % exporter)
if (not exporters):
print("\t- None")
return
elif (len(sys.argv) < 3):
self.print_usage()
return
self.target_directories = sys.argv[2:]
self.target_exporter = sys.argv[1]
self.run()
def run(self):
exporter = None
if (self.target_exporter.lower() != "none"):
exporters = self.get_available_exporters()
try:
exporter = exporters[self.target_exporter]
except KeyError as e:
print("Error: No such exporter '%s'." % self.target_exporter)
self.print_usage()
return
scraper = tsscraper.TSScraper(self.target_directories, self.process_count)
results = scraper.process()
# Init the DokuOutput
# if (exporter is not None):
# output = exporter.Exporter(results)
# output.write()
if __name__ == "__main__":
print("Operation Completion-----------------------\n%f Seconds" % timeit.timeit("Application().main()", number=1, setup="from __main__ import Application"))

7
setup.py Normal file

@@ -0,0 +1,7 @@
from distutils.core import setup
from Cython.Build import cythonize
setup(
name = 'Script Scraper',
ext_modules = cythonize("main.py"),
)
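setup.py is the performance half of this commit: it compiles main.py into a C extension with Cython. Assuming Cython is installed, the standard build invocation is:

python setup.py build_ext --inplace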

tsscraper.py

@@ -1,10 +1,19 @@
import re
import os
import sys
import multiprocessing
import importlib
import os.path
import timeit
import cProfile
class FileEntry(object):
"""
Class representing a file in the mod directory. This
contains all processed nodes within the file data.
"""
path = None
global_functions = None
bound_functions = None
@@ -17,6 +26,10 @@ class FileEntry(object):
self.datablocks = [ ]
class Function(object):
"""
Class representing a Function entity in the game code tree
that the parse stage produces.
"""
name = None
parameters = None
type = None
@@ -35,6 +48,10 @@ class Function(object):
self.type = type
class Global(object):
"""
Class representing a global variable. This is currently unused
in the code.
"""
name = None
def __init__(self, name):
@@ -44,6 +61,10 @@ class Global(object):
return "$%s" % self.name
class Datablock(object):
"""
Class representing a datablock entry. It contains the type, the derived
(parent) datablock name, the datablock's own name, and all assigned properties.
"""
name = None
type = None
derived = None
@@ -62,10 +83,122 @@ class Datablock(object):
self.aliases = [ ]
self.properties = properties
self.filepath = filepath
def scrape_file(input):
"""
This function is a performance-critical segment of the scraper.
It performs the initial parsing step, producing a high-level
representation of the mod for later stages to process and
eventually output.
"""
filepath, parameter_split, combined_pattern = input
key_value_pattern = re.compile("(?<!.)\s*[A-z]+\s*=\s*(\S+);")
global_usages = re.compile("\{.*\$[A-z]+(::([A-z]+))*?.*\}")
global_pattern = re.compile("(?<!.)\s*\$[A-z]+(::([A-z]+))*?")
parameter_split = re.compile("\s*,\s*")
assignment_split = re.compile("\s*=\s*")
with open(filepath, "r") as handle:
result = FileEntry(filepath)
file_data = handle.read()
# Parse for all sequences now
for match in re.finditer(combined_pattern, file_data):
line = file_data[0:match.start()].count("\n") + 1
match_text = match.group(0).strip()
if (match_text[0:8] == "function"):
# '::' cannot appear in a plain TorqueScript function declaration, so its
# presence is what distinguishes a bound (namespaced) function.
if ("::" in match_text):
match_split = match.group(0).strip()[9:].split("::")
type = match_split[0].lower()
match_split = match_split[1].split("(")
name = match_split[0].lower()
match_split = match_split[1].replace(")", "").split(",")
parameters = [ ]
for parameter in match_split:
if (parameter == ""):
continue
parameters.append(parameter.lstrip().rstrip())
result.bound_functions.setdefault(type, [])
result.bound_functions[type].append(Function(name, type, parameters, filepath, line))
else:
match_split = match.group(0).strip()[9:].split("(")
name = match_split[0].lower()
match_split = re.split(parameter_split, match_split[1].replace(")", ""))
parameters = [ ]
for parameter in match_split:
if (parameter == ""):
continue
parameters.append(parameter.strip())
result.global_functions.append(Function(name, None, parameters, filepath, line))
else:
line = file_data[0:match.start()].count("\n") + 1
match_text = match.group(0).lstrip().rstrip()
header = match_text[0:match_text.find("{")]
type = header[10:header.find("(")].strip().lower()
name = header[header.find("(") + 1:header.find(")")].strip().lower()
# Inherited?
inherited = None
inheritor = header.find(":")
if (inheritor != -1):
inherited = header[inheritor + 1:].strip().lower()
# Blow through key, values
properties = { }
for property_match in re.finditer(key_value_pattern, match_text):
property_text = property_match.group(0)
key, value = re.split(assignment_split, property_text, 1)
key = key.lstrip().lower()
value = value.rstrip().rstrip(";")
# Global reference
if (value[0] == "$"):
value = Global(value[1:])
# String
elif (value[0] == "\""):
value = value[1:value.rfind("\"")]
# Numerics
else:
try:
value = float(value)
except ValueError as e:
# If this was raised, treat it as a string
pass
properties[key] = value
result.datablocks.append(Datablock(name, type, properties, filepath, line, inherited))
return result
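scrape_file is deliberately a module-level function rather than a TSScraper method: multiprocessing.Pool.map pickles its callable, and only top-level functions pickle cleanly on all platforms. A minimal sketch of how the pool consumes it, assuming the snippet lives at the bottom of this module (the paths are hypothetical and the pattern is a simplified stand-in for _combined_pattern):

import re
import multiprocessing

if __name__ == "__main__":
    parameter_split = re.compile(r"\s*,\s*")
    combined_pattern = re.compile(r"(?<!.)\s*function\s+\w+")  # simplified stand-in
    files = ["./server/weapons.cs", "./server/vehicles.cs"]    # hypothetical paths
    jobs = [(path, parameter_split, combined_pattern) for path in files]
    pool = multiprocessing.Pool(processes=4)
    try:
        entries = pool.map(scrape_file, jobs)  # one FileEntry per input file
    finally:
        pool.close()
        pool.join()

Compiled pattern objects are picklable (they are rebuilt from their pattern string and flags on the worker side), which is why they can be passed through the job tuples here and in _parse_stage below.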
class TSScraper(object):
_process_count = None
_target_directories = None
_dependencies = None
_combined_pattern = re.compile("(?<!.)\s*function\s+(([A-z]|_))+(::([A-z]|_)+)*\(\s*(%[A-z]+(\s*,\s*%[A-z]+)*)*\s*\)|((?<!.)\s*datablock\s+[A-z]+\s*\(\s*\S+\s*\)\s*(:\s*[A-z]+)?\s*(//.*)?\s*\{(\s|\S)*?\s*(?<!.)\};)")
bound_function_pattern = re.compile("(?<!.)\s*function\s+(([A-z]|_)+::)([A-z]|_)+\(\s*(%[A-z]+(\s*,\s*%[A-z]+)*)*\s*\)")
function_pattern = re.compile("(?<!.)\s*function\s+([A-z]|_)+\(\s*(%[A-z]+(\w*,\s*%[A-z]+)*)*\s*\)")
function_pattern = re.compile("(?<!.)\s*function\s+(([A-z]|_)+::)([A-z]|_)+\(\s*(%[A-z]+(\w*,\s*%[A-z]+)*)*\s*\)")
datablock_pattern = re.compile("(?<!.)\s*datablock\s+[A-z]+\s*\(\s*\S+\s*\)\s*(:\s*[A-z]+)?\s*(//.*)?\s*\{(\s|\S)*?\s*(?<!.)\};")
key_value_pattern = re.compile("(?<!.)\s*[A-z]+\s*=\s*(\S+);")
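A note on the (?<!.) idiom used throughout these patterns: since . does not match a newline, the negative lookbehind only succeeds at the start of the input or immediately after a newline, making it a line anchor equivalent to ^ with re.MULTILINE. A quick demonstration:

import re

pattern = re.compile(r"(?<!.)\s*function")
assert pattern.match("function foo()")          # start of input
assert pattern.search("x();\nfunction bar()")   # immediately after a newline
assert not pattern.search("xfunction baz()")    # mid-line: a character precedes it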
@@ -79,12 +212,11 @@ class Application(object):
parameter_split = re.compile("\s*,\s*")
assignment_split = re.compile("\s*=\s*")
def print_usage(self):
print("Usage: '%s <target directory> <exporter>'" % sys.argv[0])
print("Or: '%s exporters' for a list of known exporters." % sys.argv[0])
# Tables for checking datablock data
datablock_reference_table = {
_log_lines = None
# Rules for verifying datablock information
_datablock_rules = {
"tracerprojectiledata": {
"references": ["splash", "explosion", "sound"],
"declared": [ ],
@@ -94,8 +226,8 @@ class Application(object):
},
"shapebaseimagedata": {
"references": ["item", "projectile"],
"declared": ["projectiletype"],
"references": [ ],
"declared": [ ],
"checks": {
}
},
@@ -103,7 +235,8 @@ class Application(object):
"itemdata": {
"references": [ ],
"declared": [ ],
"checks": { }
"checks": { "pickupradius": (lambda x: x > 0, "Items should have >= 1 pickup radius.")
}
},
"audioprofile": {
@@ -327,68 +460,120 @@ class Application(object):
"declared": [ ],
"checks": { },
},
"effectprofile": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"precipitationdata": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"commandericondata": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"missionmarkerdata": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"particleemissiondummydata": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"fireballatmospheredata": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"audiodescription": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"lightningdata": {
"references": [ ],
"declared": [ ],
"checks": { },
},
"audioenvironment": {
"references": [ ],
"declared": [ ],
"checks": { },
},
}
"""
TracerProjectileData:
splash
explosion
sound
ShapeBaseImageData:
item
projectile
projectileType == projectile.type
"""
def check_datablock_references(self, data, known_datablocks):
def __init__(self, target_directories, process_count = 0):
self._process_count = process_count
self._target_directories = target_directories
# For each file entry
for file in data:
# For each datablock
for datablock in file.datablocks:
if (datablock.type in self.datablock_reference_table):
# Flip through each reference in the table
for reference in self.datablock_reference_table[datablock.type]["references"]:
if (reference not in datablock.properties):
print("Reference Warning: %s datablock '%s' has no '%s' declaration! (Declaration in %s, line %u)" % (datablock.type, datablock.name, reference, datablock.filepath, datablock.line))
else:
if (datablock.properties[reference] not in known_datablocks.keys()):
print("Reference Warning: %s Datablock '%s' references '%s' in property '%s', which does not exist! (Declaration in %s, line %u)" % (datablock.type, datablock.name, datablock.properties[reference], reference, datablock.filepath, datablock.line))
# Check each declaration
for declaration in self.datablock_reference_table[datablock.type]["declared"]:
if (declaration not in datablock.properties):
print("Declaration Warning: %s Datablock '%s' required property '%s' not declared! (Declaration in %s, line %u)" % (datablock.type, datablock.name, declaration, datablock.filepath, datablock.line))
# Run custom checks
for check in self.datablock_reference_table[datablock.type]["checks"].keys():
# Is it declared?
if (check not in datablock.properties):
print("Property Warning: %s Datablock %s '%s' property not declared! (Declaration in %s, line %u)" % (datablock.type, datablock.name, check, datablock.filepath, datablock.line))
else:
method, message = self.datablock_reference_table[datablock.type]["checks"][check]
if (not method(datablock.properties[check])):
print("Property Warning (Datablock '%s', type %s. Declaration in %s, line %u): %s" % (datablock.name, datablock.type, datablock.filepath, datablock.line, message))
else:
print("Program Error: Unknown datablock type '%s'! This means the software does not know how to check this datablock. (Declaration in %s, line %u)" % (datablock.type, datablock.filepath, datablock.line))
self._log_lines = [ ]
def get_file_list(self, directory):
output = [ ]
previous_working_directory = os.getcwd()
os.chdir(directory)
for root, dirs, files in os.walk("."):
for filename in files:
relative_path = os.path.join(root, filename)
if (not os.path.isfile(relative_path)):
continue
absolute_path = os.path.realpath(relative_path)
# Only check TS files
name, extension = os.path.splitext(filename)
if (extension != ".cs"):
continue
def resolve_datablock_parents(self, data, known_datablocks):
# For each file entry
for file in data:
# For each datablock
for datablock in file.datablocks:
if (datablock.derived is not None and datablock.derived not in known_datablocks.keys()):
print("Warning: Datablock '%s' derives from non-existent parent '%s'! (Declaration in %s, line %u)" % (datablock.name, datablock.derived,datablock.filepath, datablock.line))
elif (datablock.derived is not None):
datablock.derived = known_datablocks[datablock.derived]
output.append((absolute_path, relative_path.lower()))
def process_data(self, data):
# Entries we've already processed
os.chdir(previous_working_directory)
return output
def _parse_stage(self, target_files):
results = None
if (self._process_count > 0):
# Create a list with all the required data for the multi-process
input = [ ]
for target_file in target_files:
input.append((target_file, self.parameter_split, self._combined_pattern))
pool = multiprocessing.Pool(processes=self._process_count)
results = pool.map(scrape_file, input)
pool.close()
pool.join()
else:
results = [ ]
for target_file in target_files:
results.append(scrape_file((target_file, self.parameter_split, self._combined_pattern)))
return results
def _declaration_stage(self, parse_results):
# Entries we've already processed
processed_entries = { }
# For each file entry
for file in data:
known_datablocks = { }
for file in parse_results:
# For each global function
for global_function in file.global_functions:
processed_entries.setdefault(global_function.name, global_function)
@@ -434,9 +619,9 @@ class Application(object):
processed_entries = { }
# For each datablock
known_datablocks = { }
for datablock in file.datablocks:
processed_entries.setdefault(datablock.name, datablock)
known_datablocks.setdefault(datablock.name, [])
known_datablocks[datablock.name].append(datablock)
@@ -454,157 +639,91 @@ class Application(object):
known_entry.aliases.append(datablock)
datablock.aliases.append(known_entry)
print("Warning: Datablock '%s' redeclared in %s, line %u! (Original declaration in %s, line %u" % (datablock.name, datablock.filepath, datablock.line, known_entry.filepath, known_entry.line))
return known_datablocks
def main(self):
# Load exporters
exporters = { }
for root, dirs, files in os.walk("exporters"):
for filename in files:
module_name, extension = os.path.splitext(filename)
if (module_name == "__init__"):
continue
try:
module = importlib.import_module('exporters.%s' % (module_name))
exporters[module_name] = module
except ImportError as e:
print(e)
if (len(sys.argv) < 2):
self.print_usage()
return
if (sys.argv[1] == "exporters"):
print("Available Exporters: ")
for exporter in exporters.keys():
print("\t- %s" % exporter)
return
elif(len(sys.argv) != 3):
self.print_usage()
return
exporter = None
try:
exporter = exporters[sys.argv[2]]
except KeyError as e:
print("Error: No such exporter '%s'." % sys.argv[2])
self.print_usage()
return
results = [ ]
global_aliases = { }
typed_aliases = { }
for root, dirs, files in os.walk(sys.argv[1]):
for filename in files:
filepath = os.path.join(root, filename)
if (not os.path.isfile(filepath)):
continue
# Only check TS files
name, extension = os.path.splitext(filepath)
if (extension != ".cs"):
continue
with open(filepath, "r") as handle:
file_entry = FileEntry(filepath)
file_data = handle.read()
# Grab Global function definitions
for match in re.finditer(self.function_pattern, file_data):
line = file_data[0:match.start()].count("\n") + 1
match_split = match.group(0).lstrip().rstrip().lstrip("function ").split("(")
name = match_split[0].lower()
match_split = re.split(self.parameter_split, match_split[1].replace(")", ""))
parameters = [ ]
for parameter in match_split:
if (parameter == ""):
continue
parameters.append(parameter.lstrip().rstrip())
file_entry.global_functions.append(Function(name, None, parameters, filepath, line))
# Grab bound function definitions
for match in re.finditer(self.bound_function_pattern, file_data):
line = file_data[0:match.start()].count("\n") + 1
match_split = match.group(0).lstrip().rstrip().lstrip("function ").split("::")
type = match_split[0].lower()
match_split = match_split[1].split("(")
name = match_split[0].lower()
match_split = match_split[1].replace(")", "").split(",")
parameters = [ ]
for parameter in match_split:
if (parameter == ""):
continue
parameters.append(parameter.lstrip().rstrip())
file_entry.bound_functions.setdefault(type, [])
file_entry.bound_functions[type].append(Function(name, type, parameters, filepath, line))
# Grab non-inherited DB definitions
for match in re.finditer(self.datablock_pattern, file_data):
line = file_data[0:match.start()].count("\n") + 1
match_text = match.group(0).lstrip().rstrip()
header = match_text[0:match_text.find("{")]
type = header[len("datablock") + 1:header.find("(")].lstrip().rstrip().lower()
name = header[header.find("(") + 1:header.find(")")].lstrip().rstrip().lower()
# Inherited?
inherited = None
inheritor = header.find(":")
if (inheritor != -1):
inherited = header[inheritor + 1:].lstrip().rstrip().lower()
# Blow through key, values
properties = { }
for property_match in re.finditer(self.key_value_pattern, match_text):
property_text = property_match.group(0)
key, value = re.split(self.assignment_split, property_text, 1)
key = key.lstrip().lower()
value = value.rstrip().rstrip(";")
# Global reference
if (value[0] == "$"):
value = Global(value[1:])
# String
elif (value[0] == "\""):
value = value[1:value.rfind("\"")]
# Numerics
else:
try:
value = float(value)
except ValueError as e:
# If this was raised, treat it as a string
pass
properties[key] = value
file_entry.datablocks.append(Datablock(name, type, properties, filepath, line, inherited))
# Stick in results
results.append(file_entry)
known_datablocks = self.process_data(results)
self.resolve_datablock_parents(results, known_datablocks)
self.check_datablock_references(results, known_datablocks)
# Init the DokuOutput
output = exporter.Exporter(results)
output.write()
def _inheritance_stage(self, parse_results, datablock_list):
# For each file entry
for file in parse_results:
# For each datablock
for datablock in file.datablocks:
if (datablock.derived is not None and datablock.derived.lower() not in datablock_list.keys()):
print("Warning: Datablock '%s' derives from non-existent parent '%s'! (Declaration in %s, line %u)" % (datablock.name, datablock.derived, datablock.filepath, datablock.line))
elif (datablock.derived is not None):
datablock.derived = datablock_list[datablock.derived]
if __name__ == "__main__":
Application().main()
def _reference_stage(self, parse_results, datablock_list):
# For each file entry
for file in parse_results:
# For each datablock
for datablock in file.datablocks:
if (datablock.type in self._datablock_rules):
# Flip through each reference in the table
for reference in self._datablock_rules[datablock.type]["references"]:
if (reference not in datablock.properties):
print("Reference Warning: %s datablock '%s' has no '%s' declaration! (Declaration in %s, line %u)" % (datablock.type, datablock.name, reference, datablock.filepath, datablock.line))
else:
if (datablock.properties[reference].lower() not in datablock_list.keys()):
print("Reference Warning: %s Datablock '%s' references '%s' in property '%s', which does not exist! (Declaration in %s, line %u)" % (datablock.type, datablock.name, datablock.properties[reference], reference, datablock.filepath, datablock.line))
# Check each declaration
for declaration in self._datablock_rules[datablock.type]["declared"]:
if (declaration not in datablock.properties):
print("Declaration Warning: %s Datablock '%s' required property '%s' not declared! (Declaration in %s, line %u)" % (datablock.type, datablock.name, declaration, datablock.filepath, datablock.line))
# Run custom checks
for check in self._datablock_rules[datablock.type]["checks"].keys():
# Is it declared?
if (check not in datablock.properties):
print("Property Warning: %s Datablock %s '%s' property not declared! (Declaration in %s, line %u)" % (datablock.type, datablock.name, check, datablock.filepath, datablock.line))
else:
method, message = self._datablock_rules[datablock.type]["checks"][check]
if (not method(datablock.properties[check])):
print("Property Warning (Datablock '%s', type %s. Declaration in %s, line %u): %s" % (datablock.name, datablock.type, datablock.filepath, datablock.line, message))
else:
print("Program Error: Unknown datablock type '%s'! This means the software does not know how to check this datablock. (Declaration in %s, line %u)" % (datablock.type, datablock.filepath, datablock.line))
def process(self):
# Process each directory sequentially
target_files = { }
for index, target_directory in enumerate(self._target_directories):
if (os.path.isdir(target_directory) is False):
raise IOError("No such directory to recurse (#%u): '%s'" % (index, target_directory))
print("INFO: Building file list for directory '%s' ..." % target_directory)
current_files = self.get_file_list(target_directory)
# Does a previous entry exist in the target file list?
for current_absolute_path, current_relative_path in current_files:
target_files[current_relative_path] = current_absolute_path
# Build the list now
target_file_list = [ ]
for current_relative_file in target_files.keys():
target_file_list.append(target_files[current_relative_file])
# Perform the initial parse
print("INFO: Performing parse stage ...")
parse_results = self._parse_stage(target_file_list)
# Perform the declaration analysis
print("INFO: Performing declaration analysis. ...")
datablock_list = self._declaration_stage(parse_results)
# Perform DB inheritance analysis
print("INFO: Performing datablock inheritance analysis ...")
self._inheritance_stage(parse_results, datablock_list)
# Perform DB reference analysis
print("INFO: Performing datablock reference analysis ...")
self._reference_stage(parse_results, datablock_list)
# We're done, return the results
print("INFO: Done.")
return parse_results
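Taken together, process() builds a de-duplicated file list (later directories override earlier ones when their relative paths collide, mirroring mod overlay semantics), then runs the parse, declaration, inheritance, and reference stages in order and returns the per-file parse results. A minimal driver sketch, assuming the directory paths are placeholders and that FileEntry stores the path it was constructed with:

import tsscraper

scraper = tsscraper.TSScraper(["./base_game", "./my_mod"], process_count=8)
results = scraper.process()
for entry in results:
    print("%s: %u datablocks" % (entry.path, len(entry.datablocks)))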