From 3b8fd518fece5375105a4999b309ad08fc34c173 Mon Sep 17 00:00:00 2001 From: Robert MacGregor Date: Tue, 28 Jul 2015 23:48:33 -0400 Subject: [PATCH] Performance fixes; fixed duplicate datablock entries; multi-processing support --- main.py | 95 ++++++ setup.py | 7 + scriptScrape.py => tsscraper.py | 549 +++++++++++++++++++------------- 3 files changed, 436 insertions(+), 215 deletions(-) create mode 100644 main.py create mode 100644 setup.py rename scriptScrape.py => tsscraper.py (58%) diff --git a/main.py b/main.py new file mode 100644 index 0000000..e3f080a --- /dev/null +++ b/main.py @@ -0,0 +1,95 @@ +""" + main.py +""" + +import re +import os +import sys +import multiprocessing +import importlib +import os.path +import timeit + +import cProfile + +import tsscraper + +class Application(object): + thread_count = 8 + + threads = None + + target_directory = None + target_exporter = None + + def print_usage(self): + print("Usage: '%s '" % sys.argv[0]) + print("Or: '%s exporters' for a list of known exporters." % sys.argv[0]) + + def get_available_exporters(self): + exporters = { } + + for root, dirs, files in os.walk("exporters"): + for filename in files: + module_name, extension = os.path.splitext(filename) + + if (module_name == "__init__"): + continue + + try: + module = importlib.import_module('exporters.%s' % (module_name)) + exporters[module_name] = module + except ImportError as e: + print(e) + + return exporters + + def main(self): + """ + The main entry point of the application. This is equivalent to + the main() method in C and C++. 
+ """ + if (len(sys.argv) < 2): + self.print_usage() + return + + exporters = self.get_available_exporters() + + if (sys.argv[1] == "exporters"): + print("Available Exporters: ") + + for exporter in exporters.keys(): + print("\t- %s" % exporter) + return + print("\t- None") + + elif(len(sys.argv) < 3): + self.print_usage() + return + + self.target_directory = sys.argv[2:] + self.target_exporter = sys.argv[1] + self.run() + + def run(self): + exporter = None + if (self.target_exporter.lower() != "none"): + exporters = self.get_available_exporters() + try: + exporter = exporters[self.target_exporter] + except KeyError as e: + print("Error: No such exporter '%s'." % self.target_exporter) + self.print_usage() + return + + scraper = tsscraper.TSScraper(self.target_directory, self.thread_count) + results = scraper.process() + + + # Init the DokuOutput + # if (exporter is not None): + # output = exporter.Exporter(results) + # output.write() + +if __name__ == "__main__": + print("Operation Completion-----------------------\n%f Seconds" % timeit.timeit("Application().main()", number=1, setup="from __main__ import Application")) \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..59d3f87 --- /dev/null +++ b/setup.py @@ -0,0 +1,7 @@ +from distutils.core import setup +from Cython.Build import cythonize + +setup( + name = 'Script Scraper', + ext_modules = cythonize("main.py"), +) \ No newline at end of file diff --git a/scriptScrape.py b/tsscraper.py similarity index 58% rename from scriptScrape.py rename to tsscraper.py index 2054b3a..f44aff9 100644 --- a/scriptScrape.py +++ b/tsscraper.py @@ -1,10 +1,19 @@ import re import os import sys +import multiprocessing import importlib import os.path +import timeit + +import cProfile + class FileEntry(object): + """ + Class representing a file in the mod directory. This + contains all processed nodes within the file data. 
+ """ path = None global_functions = None bound_functions = None @@ -17,6 +26,10 @@ class FileEntry(object): self.datablocks = [ ] class Function(object): + """ + Class representing a Function entity in the game code tree + that the parse stage produces. + """ name = None parameters = None type = None @@ -35,6 +48,10 @@ class Function(object): self.type = type class Global(object): + """ + Class representing a global variable. This is currently unused + in the coding. + """ name = None def __init__(self, name): @@ -44,6 +61,10 @@ class Global(object): return "$%s" % self.name class Datablock(object): + """ + Class representing a datablock entry. It contains the type, derived + datablock name, the datablock name itself and all assigned properties. + """ name = None type = None derived = None @@ -62,10 +83,122 @@ class Datablock(object): self.aliases = [ ] self.properties = properties self.filepath = filepath + +def scrape_file(input): + """ + This method is a performance critical code segment in the scraper. + It is what performs the initial parsing step to produce a sort of + high level representation of the mod for later steps to process + and eventually output. + """ + filepath, parameter_split, combined_pattern = input -class Application(object): + key_value_pattern = re.compile("(? '" % sys.argv[0]) - print("Or: '%s exporters' for a list of known exporters." 
% sys.argv[0]) - # Tables for checking datablock data - datablock_reference_table = { + _log_lines = None + + # Rules for verifying datablock information + _datablock_rules = { "tracerprojectiledata": { "references": ["splash", "explosion", "sound"], "declared": [ ], @@ -94,8 +226,8 @@ class Application(object): }, "shapebaseimagedata": { - "references": ["item", "projectile"], - "declared": ["projectiletype"], + "references": [ ], + "declared": [ ], "checks": { } }, @@ -103,7 +235,8 @@ class Application(object): "itemdata": { "references": [ ], "declared": [ ], - "checks": { } + "checks": { "pickupradius": (lambda x: x > 0, "Items should have >= 1 pickup radius.") + } }, "audioprofile": { @@ -327,68 +460,120 @@ class Application(object): "declared": [ ], "checks": { }, }, + + "effectprofile": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + "precipitationdata": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + "commandericondata": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + "missionmarkerdata": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + "particleemissiondummydata": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + "fireballatmospheredata": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + "audiodescription": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + + "lightningdata": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, + + "audioenvironment": { + "references": [ ], + "declared": [ ], + "checks": { }, + }, } - """ - TracerProjectileData: - splash - explosion - sound - - ShapeBaseImageData: - item - projectile - projectileType == projectile.type - """ - def check_datablock_references(self, data, known_datablocks): + def __init__(self, target_directories, process_count = 0): + self._process_count = process_count + self._target_directories = target_directories - # For each file entry - for file in data: - # For 
each datablock - for datablock in file.datablocks: - if (datablock.type in self.datablock_reference_table): - # Flip through each reference in the table - for reference in self.datablock_reference_table[datablock.type]["references"]: - if (reference not in datablock.properties): - print("Reference Warning: %s datablock '%s' has no '%s' declaration! (Declaration in %s, line %u)" % (datablock.type, datablock.name, reference, datablock.filepath, datablock.line)) - else: - if (datablock.properties[reference] not in known_datablocks.keys()): - print("Reference Warning: %s Datablock '%s' references '%s' in property '%s', which does not exist! (Declaration in %s, line %u)" % (datablock.type, datablock.name, datablock.properties[reference], reference, datablock.filepath, datablock.line)) - - # Check each declaration - for declaration in self.datablock_reference_table[datablock.type]["declared"]: - if (declaration not in datablock.properties): - print("Declaration Warning: %s Datablock '%s' required property '%s' not declared! (Declaration in %s, line %u)" % (datablock.type, datablock.name, declaration, datablock.filepath, datablock.line)) - - # Run custom checks - for check in self.datablock_reference_table[datablock.type]["checks"].keys(): - # Is it declared? - if (check not in datablock.properties): - print("Property Warning: %s Datablock %s '%s' property not declared! (Declaration in %s, line %u)" % (datablock.type, datablock.name, check, datablock.filepath, datablock.line)) - else: - method, message = self.datablock_reference_table[datablock.type]["checks"][check] - - if (not method(datablock.properties[check])): - print("Property Warning (Datablock '%s', type %s. Declaration in %s, line %u): %s" % (datablock.name, datablock.type, datablock.filepath, datablock.line, message)) - else: - print("Program Error: Unknown datablock type '%s'! This means the software does not know how to check this datablock. 
(Declaration in %s, line %u)" % (datablock.type, datablock.filepath, datablock.line)) + self._log_lines = [ ] + + def get_file_list(self, directory): + output = [ ] + + previous_working_directory = os.getcwd() + os.chdir(directory) + + for root, dirs, files in os.walk("."): + for filename in files: + relative_path = os.path.join(root, filename) + + if (not os.path.isfile(relative_path)): + continue + + absolute_path = os.path.realpath(relative_path) + + # Only check TS files + name, extension = os.path.splitext(filename) + if (extension != ".cs"): + continue - def resolve_datablock_parents(self, data, known_datablocks): - # For each file entry - for file in data: - # For each datablock - for datablock in file.datablocks: - if (datablock.derived is not None and datablock.derived not in known_datablocks.keys()): - print("Warning: Datablock '%s' derives from non-existent parent '%s'! (Declaration in %s, line %u)" % (datablock.name, datablock.derived,datablock.filepath, datablock.line)) - elif (datablock.derived is not None): - datablock.derived = known_datablocks[datablock.derived] + output.append((absolute_path, relative_path.lower())) - def process_data(self, data): - # Entries we've already processed + os.chdir(previous_working_directory) + return output + + def _parse_stage(self, target_files): + results = None + if (self._process_count > 0): + # Create a list with all the required data for the multi-process + input = [ ] + + for target_file in target_files: + input.append((target_file, self.parameter_split, self._combined_pattern)) + + pool = multiprocessing.Pool(processes=self._process_count) + results = pool.map(scrape_file, input) + else: + results = [ ] + + for target_file in target_files: + results.append(scrape_file((target_file, self.parameter_split, self._combined_pattern))) + + return results + + def _declaration_stage(self, parse_results): + # Entries we've already processed processed_entries = { } # For each file entry - for file in data: + 
known_datablocks = { } + for file in parse_results: # For each global function for global_function in file.global_functions: processed_entries.setdefault(global_function.name, global_function) @@ -434,9 +619,9 @@ class Application(object): processed_entries = { } # For each datablock - known_datablocks = { } for datablock in file.datablocks: processed_entries.setdefault(datablock.name, datablock) + known_datablocks.setdefault(datablock.name, []) known_datablocks[datablock.name].append(datablock) @@ -454,157 +639,91 @@ class Application(object): known_entry.aliases.append(datablock) datablock.aliases.append(known_entry) print("Warning: Datablock '%s' redeclared in %s, line %u! (Original declaration in %s, line %u" % (datablock.name, datablock.filepath, datablock.line, known_entry.filepath, known_entry.line)) - + return known_datablocks - def main(self): - # Load exporters - exporters = { } - for root, dirs, files in os.walk("exporters"): - for filename in files: - module_name, extension = os.path.splitext(filename) - - if (module_name == "__init__"): - continue - - try: - module = importlib.import_module('exporters.%s' % (module_name)) - exporters[module_name] = module - except ImportError as e: - print(e) - - if (len(sys.argv) < 2): - self.print_usage() - return - - if (sys.argv[1] == "exporters"): - print("Available Exporters: ") - - for exporter in exporters.keys(): - print("\t- %s" % exporter) - return - elif(len(sys.argv) != 3): - self.print_usage() - return - - exporter = None - try: - exporter = exporters[sys.argv[2]] - except KeyError as e: - print("Error: No such exporter '%s'." 
% sys.argv[2]) - self.print_usage() - return - - results = [ ] - global_aliases = { } - typed_aliases = { } - for root, dirs, files in os.walk(sys.argv[1]): - for filename in files: - filepath = os.path.join(root, filename) - - if (not os.path.isfile(filepath)): - continue - - # Only check TS files - name, extension = os.path.splitext(filepath) - if (extension != ".cs"): - continue - - with open(filepath, "r") as handle: - file_entry = FileEntry(filepath) - - file_data = handle.read() - - # Grab Global function definitions - for match in re.finditer(self.function_pattern, file_data): - line = file_data[0:match.start()].count("\n") + 1 - match_split = match.group(0).lstrip().rstrip().lstrip("function ").split("(") - name = match_split[0].lower() - - match_split = re.split(self.parameter_split, match_split[1].replace(")", "")) - - parameters = [ ] - for parameter in match_split: - if (parameter == ""): - continue - - parameters.append(parameter.lstrip().rstrip()) - - file_entry.global_functions.append(Function(name, None, parameters, filepath, line)) - - # Grab bound function definitions - for match in re.finditer(self.bound_function_pattern, file_data): - line = file_data[0:match.start()].count("\n") + 1 - - match_split = match.group(0).lstrip().rstrip().lstrip("function ").split("::") - type = match_split[0].lower() - - match_split = match_split[1].split("(") - name = match_split[0].lower() - match_split = match_split[1].replace(")", "").split(",") - - parameters = [ ] - for parameter in match_split: - if (parameter == ""): - continue - parameters.append(parameter.lstrip().rstrip()) - - file_entry.bound_functions.setdefault(type, []) - file_entry.bound_functions[type].append(Function(name, type, parameters, filepath, line)) - - # Grab non-inherited DB definitions - for match in re.finditer(self.datablock_pattern, file_data): - line = file_data[0:match.start()].count("\n") + 1 - match_text = match.group(0).lstrip().rstrip() - - header = 
match_text[0:match_text.find("{")] - type = header[len("datablock") + 1:header.find("(")].lstrip().rstrip().lower() - name = header[header.find("(") + 1:header.find(")")].lstrip().rstrip().lower() - - # Inherited? - inherited = None - inheritor = header.find(":") - if (inheritor != -1): - inherited = header[inheritor + 1:].lstrip().rstrip().lower() - - # Blow through key, values - properties = { } - for property_match in re.finditer(self.key_value_pattern, match_text): - property_text = property_match.group(0) - - key, value = re.split(self.assignment_split, property_text, 1) - key = key.lstrip().lower() - - value = value.rstrip().rstrip(";") - - # Global reference - if (value[0] == "$"): - value = Global(value[1:]) - # String - elif (value[0] == "\""): - value = value[1:value.rfind("\"")] - # Numerics - else: - try: - value = float(value) - except ValueError as e: - # If this was raised, treat it as a string - pass - - properties[key] = value - - file_entry.datablocks.append(Datablock(name, type, properties, filepath, line, inherited)) - - # Stick in results - results.append(file_entry) - - known_datablocks = self.process_data(results) - self.resolve_datablock_parents(results, known_datablocks) - self.check_datablock_references(results, known_datablocks) - - # Init the DokuOutput - output = exporter.Exporter(results) - output.write() + def _inheritance_stage(self, parse_results, datablock_list): + # For each file entry + for file in parse_results: + # For each datablock + for datablock in file.datablocks: + if (datablock.derived is not None and datablock.derived.lower() not in datablock_list.keys()): + print("Warning: Datablock '%s' derives from non-existent parent '%s'! 
(Declaration in %s, line %u)" % (datablock.name, datablock.derived, datablock.filepath, datablock.line)) + elif (datablock.derived is not None): + datablock.derived = datablock_list[datablock.derived] -if __name__ == "__main__": - Application().main() + def _reference_stage(self, parse_results, datablock_list): + # For each file entry + for file in parse_results: + # For each datablock + for datablock in file.datablocks: + if (datablock.type in self._datablock_rules): + # Flip through each reference in the table + for reference in self._datablock_rules[datablock.type]["references"]: + if (reference not in datablock.properties): + print("Reference Warning: %s datablock '%s' has no '%s' declaration! (Declaration in %s, line %u)" % (datablock.type, datablock.name, reference, datablock.filepath, datablock.line)) + else: + if (datablock.properties[reference].lower() not in datablock_list.keys()): + print("Reference Warning: %s Datablock '%s' references '%s' in property '%s', which does not exist! (Declaration in %s, line %u)" % (datablock.type, datablock.name, datablock.properties[reference], reference, datablock.filepath, datablock.line)) + + # Check each declaration + for declaration in self._datablock_rules[datablock.type]["declared"]: + if (declaration not in datablock.properties): + print("Declaration Warning: %s Datablock '%s' required property '%s' not declared! (Declaration in %s, line %u)" % (datablock.type, datablock.name, declaration, datablock.filepath, datablock.line)) + + # Run custom checks + for check in self._datablock_rules[datablock.type]["checks"].keys(): + # Is it declared? + if (check not in datablock.properties): + print("Property Warning: %s Datablock %s '%s' property not declared! 
(Declaration in %s, line %u)" % (datablock.type, datablock.name, check, datablock.filepath, datablock.line)) + else: + method, message = self._datablock_rules[datablock.type]["checks"][check] + if (not method(datablock.properties[check])): + print("Property Warning (Datablock '%s', type %s. Declaration in %s, line %u): %s" % (datablock.name, datablock.type, datablock.filepath, datablock.line, message)) + else: + print("Program Error: Unknown datablock type '%s'! This means the software does not know how to check this datablock. (Declaration in %s, line %u)" % (datablock.type, datablock.filepath, datablock.line)) + + def process(self): + result = None + + # Process each directory sequentially + target_files = { } + for index, target_directory in enumerate(self._target_directories): + if (os.path.isdir(target_directory) is False): + raise IOError("No such directory to recurse (#%u): '%s'" % (index, target_directory)) + + print("INFO: Building file list for directory '%s' ..." % target_directory) + current_files = self.get_file_list(target_directory) + + # Does a previous entry exist in the target file list? + for current_absolute_path, current_relative_path in current_files: + target_files[current_relative_path] = current_absolute_path + + # Build the list now + target_file_list = [ ] + + for current_relative_file in target_files.keys(): + target_file_list.append(target_files[current_relative_file]) + + # Perform the initial parse + print("INFO: Performing parse stage ...") + parse_results = self._parse_stage(target_file_list) + + # Perform the declaration analysis + print("INFO: Performing declaration analysis. 
...") + datablock_list = self._declaration_stage(parse_results) + + # Perform DB inheritance analysis + print("INFO: Performing datablock inheritance analysis ...") + self._inheritance_stage(parse_results, datablock_list) + + # Perform DB reference analysis + print("INFO: Performing datablock reference analysis ...") + self._reference_stage(parse_results, datablock_list) + + # We're done, return the results + print("INFO: Done.") + + return result + + \ No newline at end of file