Merge pull request #717 from lukaspj/fix/improve-taml-xml-formatting-no-tampering

Improve tinyXml2 output formatting
This commit is contained in:
Brian Roberts 2022-01-17 05:03:24 -06:00 committed by GitHub
commit e2a2f26828
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 345 additions and 125 deletions

View file

@ -26,6 +26,41 @@
#include "console/console.h"
// Constructs a printer bound to a VFS FileStream.
// The tinyxml2 base printer is given a null FILE pointer; the stream reference
// is stored so the output functions of this class can write to it instead.
VfsXMLPrinter::VfsXMLPrinter(FileStream& stream, bool compact, int depth)
    : XMLPrinter(nullptr, compact, depth)
    , m_Stream(stream)
{
}
// Flushes pending output and then closes the wrapped stream.
// NOTE(review): m_Stream is only a reference, so the stream passed to the
// constructor is closed when this printer is destroyed - confirm that every
// caller expects to get its stream back closed.
VfsXMLPrinter::~VfsXMLPrinter()
{
m_Stream.flush();
m_Stream.close();
}
// Add VFS friendly implementations of output functions

// printf-style output: captures the variadic arguments and forwards them,
// together with the format string, to the stream's writeFormattedBuffer().
// NOTE(review): this assumes writeFormattedBuffer() accepts a va_list. If it is
// itself declared variadic (const char*, ...), the va_list would be consumed as
// a single argument and the output would be wrong - confirm against the stream API.
void VfsXMLPrinter::Print(const char* format, ...)
{
    va_list args;
    va_start(args, format);
    m_Stream.writeFormattedBuffer(format, args);
    va_end(args);
}
// Writes `size` bytes starting at `data` to the wrapped stream.
// Note the argument order of the stream call: byte count first, then buffer.
void VfsXMLPrinter::Write(const char* data, size_t size)
{
m_Stream.write(size, data);
}
void VfsXMLPrinter::Putc(char ch)
{
m_Stream.write(static_cast<U8>(ch));
}
bool VfsXMLDocument::LoadFile(const char* pFilename)
{
// Expand the file-path.
@ -61,6 +96,114 @@ bool VfsXMLDocument::LoadFile(const char* pFilename)
return true;
}
// Loads and parses an XML document from an already opened stream.
//
// The complete stream (getStreamSize() bytes) is read into a temporary buffer,
// its line endings are normalised to LF in place, and the normalised text is
// handed to Parse().
//
// stream: the stream to read the document from.
// Returns true when no error is recorded after parsing. Returns false when the
// stream is empty (XML_ERROR_EMPTY_DOCUMENT), when the read fails
// (XML_ERROR_FILE_COULD_NOT_BE_OPENED) or when Parse() records an error.
bool VfsXMLDocument::LoadFile(FileStream& stream)
{
    // Delete the existing data:
    Clear();
    // Clear shadowed error
    ClearError();
    //TODO: Can't clear location, investigate if this gives issues.
    //doc.location.Clear();

    // Get the file size, so we can pre-allocate the buffer. HUGE speed impact.
    long length = stream.getStreamSize();

    // Strange case, but good to handle up front.
    if (length <= 0)
    {
        SetError(tinyxml2::XML_ERROR_EMPTY_DOCUMENT, 0, 0);
        return false;
    }

    // From the XML spec, 2.11 End-of-Line Handling:
    //   ...the XML processor MUST behave as if it normalized all line breaks in
    //   external parsed entities (including the document entity) on input, before
    //   parsing, by translating both the two-character sequence #xD #xA and any
    //   #xD that is not followed by #xA to a single #xA character.
    //
    // TinyXml originally read the file with fgets(); it is not clear that fgets
    // performs this translation, and certainly not that it does so consistently
    // across platforms. The whole stream is therefore read raw and normalised
    // by hand below.
    char* buf = new char[length + 1];
    buf[0] = 0;

    if (!stream.read(length, buf))
    {
        delete[] buf;
        SetError(tinyxml2::XML_ERROR_FILE_COULD_NOT_BE_OPENED, 0, 0);
        return false;
    }

    // Process the buffer in place to normalize new lines.
    // Copies from the 'p' to the 'q' pointer, where p can advance faster than q
    // whenever a CR+LF pair is collapsed into a single LF.
    //
    // Line ending conventions:
    //    * LF:    Unix and Unix-like systems
    //    * CR+LF: DOS, OS/2, Microsoft Windows
    //    * CR:    classic Mac OS (up to version 9)
    const char* p = buf; // the read head
    char* q = buf;       // the write head
    const char CR = 0x0d;
    const char LF = 0x0a;

    // Terminate the raw data so the scan below stops at the end of the buffer.
    buf[length] = 0;
    while (*p)
    {
        assert(p < (buf + length));
        assert(q <= (buf + length));
        assert(q <= p);

        if (*p == CR)
        {
            *q++ = LF;
            p++;
            if (*p == LF)
            {
                // check for CR+LF (and skip LF)
                p++;
            }
        }
        else
        {
            *q++ = *p++;
        }
    }
    assert(q <= (buf + length));
    *q = 0;

    // Pass the length of the normalised text, not the raw stream size: every
    // collapsed CR+LF pair makes the text shorter, and the bytes between the
    // new terminator and buf + length are stale left-overs of the raw data.
    Parse(buf, static_cast<size_t>(q - buf));

    delete[] buf;
    return !Error();
}
// Prints this document to the given stream.
// A VfsXMLPrinter performs the actual stream output; it is wrapped in a
// PrettyXMLPrinter, which is the visitor handed to Print().
// Returns true when no error is recorded after printing.
bool VfsXMLDocument::SaveFile(FileStream& stream)
{
    // Drop any error left over from a previous save; otherwise it would be
    // reported as the result of this call.
    ClearError();

    // Non-compact output, starting at depth 0.
    VfsXMLPrinter streamPrinter(stream, false, 0);
    PrettyXMLPrinter formatter(streamPrinter);
    Print(&formatter);

    const bool succeeded = !Error();
    return succeeded;
}
bool VfsXMLDocument::SaveFile(const char* pFilename)
{
// Expand the file-name into the file-path buffer.
@ -119,141 +262,110 @@ void VfsXMLDocument::SetError(tinyxml2::XMLError error, int lineNum, const char*
delete[] buffer;
}
VfsXMLPrinter::VfsXMLPrinter(FileStream& stream, bool compact, int depth)
: XMLPrinter(NULL, compact, depth),
m_Stream(stream)
// Overwrite Visitation of elements to add newlines before attributes
PrettyXMLPrinter::PrettyXMLPrinter(VfsXMLPrinter& innerPrinter, int depth)
: mInnerPrinter(innerPrinter),
mDepth(depth)
{
}
VfsXMLPrinter::~VfsXMLPrinter()
{
m_Stream.flush();
m_Stream.close();
}
void VfsXMLPrinter::Print(const char* format, ...)
{
va_list va;
va_start(va, format);
m_Stream.writeFormattedBuffer(format, va);
va_end(va);
}
void VfsXMLPrinter::Write(const char* data, size_t size)
{
m_Stream.write(size, data);
}
void VfsXMLPrinter::Putc(char ch)
{
m_Stream.write(static_cast<U8>(ch));
}
bool VfsXMLDocument::LoadFile(FileStream& stream)
{
// Delete the existing data:
Clear();
// Clear shadowed error
ClearError();
//TODO: Can't clear location, investigate if this gives issues.
//doc.location.Clear();
// Get the file size, so we can pre-allocate the string. HUGE speed impact.
long length = stream.getStreamSize();
// Strange case, but good to handle up front.
if (length <= 0)
{
SetError(tinyxml2::XML_ERROR_EMPTY_DOCUMENT, 0, 0);
return false;
for (int i = 0; i < ENTITY_RANGE; ++i) {
mEntityFlag[i] = false;
mRestrictedEntityFlag[i] = false;
}
// Subtle bug here. TinyXml did use fgets. But from the XML spec:
// 2.11 End-of-Line Handling
// <snip>
// <quote>
// ...the XML processor MUST behave as if it normalized all line breaks in external
// parsed entities (including the document entity) on input, before parsing, by translating
// both the two-character sequence #xD #xA and any #xD that is not followed by #xA to
// a single #xA character.
// </quote>
//
// It is not clear fgets does that, and certainly isn't clear it works cross platform.
// Generally, you expect fgets to translate from the convention of the OS to the c/unix
// convention, and not work generally.
/*
while( fgets( buf, sizeof(buf), file ) )
{
data += buf;
for (int i = 0; i < NUM_ENTITIES; ++i) {
const char entityValue = entities[i].value;
const unsigned char flagIndex = static_cast<unsigned char>(entityValue);
TIXMLASSERT(flagIndex < ENTITY_RANGE);
mEntityFlag[flagIndex] = true;
}
*/
mRestrictedEntityFlag[static_cast<unsigned char>('&')] = true;
mRestrictedEntityFlag[static_cast<unsigned char>('<')] = true;
mRestrictedEntityFlag[static_cast<unsigned char>('>')] = true; // not required, but consistency is nice
}
char* buf = new char[length + 1];
buf[0] = 0;
void PrettyXMLPrinter::PrintString(const char* p, bool restricted)
{
// Look for runs of bytes between entities to print.
const char* q = p;
if (!stream.read(length, buf))
{
delete [] buf;
SetError(tinyxml2::XML_ERROR_FILE_COULD_NOT_BE_OPENED, 0, 0);
return false;
}
// Process the buffer in place to normalize new lines. (See comment above.)
// Copies from the 'p' to 'q' pointer, where p can advance faster if
// a newline-carriage return is hit.
//
// Wikipedia:
// Systems based on ASCII or a compatible character set use either LF (Line feed, '\n', 0x0A, 10 in decimal) or
// CR (Carriage return, '\r', 0x0D, 13 in decimal) individually, or CR followed by LF (CR+LF, 0x0D 0x0A)...
// * LF: Multics, Unix and Unix-like systems (GNU/Linux, AIX, Xenix, Mac OS X, FreeBSD, etc.), BeOS, Amiga, RISC OS, and others
// * CR+LF: DEC RT-11 and most other early non-Unix, non-IBM OSes, CP/M, MP/M, DOS, OS/2, Microsoft Windows, Symbian OS
// * CR: Commodore 8-bit machines, Apple II family, Mac OS up to version 9 and OS-9
const char* p = buf; // the read head
char* q = buf; // the write head
const char CR = 0x0d;
const char LF = 0x0a;
buf[length] = 0;
while (*p)
{
assert(p < (buf+length));
assert(q <= (buf+length));
assert(q <= p);
if (*p == CR)
{
*q++ = LF;
p++;
if (*p == LF)
{
// check for CR+LF (and skip LF)
p++;
if (mProcessEntities) {
const bool* flag = restricted ? mRestrictedEntityFlag : mEntityFlag;
while (*q) {
TIXMLASSERT(p <= q);
// Remember, char is sometimes signed. (How many times has that bitten me?)
if (*q > 0 && *q < ENTITY_RANGE) {
// Check for entities. If one is found, flush
// the stream up until the entity, write the
// entity, and keep looking.
if (flag[static_cast<unsigned char>(*q)]) {
while (p < q) {
const size_t delta = q - p;
const int toPrint = (INT_MAX < delta) ? INT_MAX : static_cast<int>(delta);
mInnerPrinter.Write(p, toPrint);
p += toPrint;
}
bool entityPatternPrinted = false;
for (int i = 0; i < NUM_ENTITIES; ++i) {
if (entities[i].value == *q) {
mInnerPrinter.Putc('&');
mInnerPrinter.Write(entities[i].pattern, entities[i].length);
mInnerPrinter.Putc(';');
entityPatternPrinted = true;
break;
}
}
if (!entityPatternPrinted) {
// TIXMLASSERT( entityPatternPrinted ) causes gcc -Wunused-but-set-variable in release
TIXMLASSERT(false);
}
++p;
}
}
++q;
TIXMLASSERT(p <= q);
}
else
{
*q++ = *p++;
// Flush the remaining string. This will be the entire
// string if an entity wasn't found.
if (p < q) {
const size_t delta = q - p;
const int toPrint = (INT_MAX < delta) ? INT_MAX : static_cast<int>(delta);
mInnerPrinter.Write(p, toPrint);
}
}
assert(q <= (buf+length));
*q = 0;
Parse(buf, length);
delete [] buf;
return !Error();
else {
mInnerPrinter.Write(p);
}
}
bool VfsXMLDocument::SaveFile(FileStream& stream)
bool PrettyXMLPrinter::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* attribute)
{
// Clear any error from the last save, otherwise it will get reported
// for *this* call.
ClearError();
VfsXMLPrinter printer(stream, false, 0);
Print(&printer);
return !Error();
const tinyxml2::XMLElement* parentElem = 0;
if (element.Parent()) {
parentElem = element.Parent()->ToElement();
}
const bool compactMode = parentElem ? mInnerPrinter.CompactMode(*parentElem) : mInnerPrinter.CompactMode(element);
mInnerPrinter.OpenElement(element.Name(), compactMode);
mDepth++;
while (attribute) {
PushAttribute(attribute->Name(), attribute->Value(), compactMode);
attribute = attribute->Next();
}
return true;
}
// Emits one attribute as name="value" through the inner printer.
// In compact mode the attribute is separated from what precedes it by a single
// space; otherwise it is placed on a new line, indented to the current depth.
// The value is written through PrintString() with the unrestricted entity set.
void PrettyXMLPrinter::PushAttribute(const char* name, const char* value, bool compactMode)
{
    if (!compactMode)
    {
        // One attribute per line, indented via the inner printer.
        mInnerPrinter.Putc('\n');
        mInnerPrinter.PrintSpace(mDepth);
    }
    else
    {
        mInnerPrinter.Putc(' ');
    }

    mInnerPrinter.Write(name);
    mInnerPrinter.Write("=\"");

    const bool useRestrictedEntities = false;
    PrintString(value, useRestrictedEntities);

    mInnerPrinter.Putc('"');
}

View file

@ -40,9 +40,18 @@ public:
VfsXMLPrinter(FileStream& stream, bool compact = false, int depth = 0);
~VfsXMLPrinter() override;
// Re-implement protected functionality in TinyXML2 library, and make it public
// (This is a bit dirty, but it's necessary for the PrettyXMLPrinter)

// Public forwarder to tinyxml2::XMLPrinter::CompactMode().
bool CompactMode(const tinyxml2::XMLElement& element) override
{
    return tinyxml2::XMLPrinter::CompactMode(element);
}
// Public forwarder to tinyxml2::XMLPrinter::PrintSpace().
void PrintSpace(int depth) override
{
    tinyxml2::XMLPrinter::PrintSpace(depth);
}
// Convenience overload: measures the NUL-terminated string with strlen() and
// forwards it to Write(data, size).
inline void Write(const char* data)
{
    const size_t length = strlen(data);
    Write(data, length);
}
// Add VFS friendly implementations of output functions
void Print(const char* format, ...) override;
void Write(const char* data, size_t size) override;
void Putc(char ch) override;
// Accept a virtual FileStream instead of a FILE pointer
FileStream& m_Stream;
};
@ -127,4 +136,103 @@ public:
}
};
/// Visitor that wraps a VfsXMLPrinter and takes over the printing of element
/// start tags (VisitEnter for elements, PushAttribute, PrintString). Every
/// other visit is forwarded unchanged to the inner printer.
class PrettyXMLPrinter : public tinyxml2::XMLPrinter
{
    // Re-implement private functionality in TinyXML2
    static const char LINE_FEED = static_cast<char>(0x0a); // all line endings are normalized to LF
    static const char LF = LINE_FEED;
    static const char CARRIAGE_RETURN = static_cast<char>(0x0d); // CR gets filtered out
    static const char CR = CARRIAGE_RETURN;
    static const char SINGLE_QUOTE = '\'';
    static const char DOUBLE_QUOTE = '\"';

    /// One escapable character: `value` is written as '&' + `pattern` + ';'.
    struct Entity
    {
        const char* pattern; ///< Entity name without the surrounding '&' and ';'.
        int length;          ///< Number of characters in `pattern`.
        char value;          ///< The raw character the entity stands for.
    };

    static const int NUM_ENTITIES = 5;
    static constexpr Entity entities[NUM_ENTITIES] = {
        {"quot", 4, DOUBLE_QUOTE},
        {"amp", 3, '&'},
        {"apos", 4, SINGLE_QUOTE},
        {"lt", 2, '<'},
        {"gt", 2, '>'}
    };

public:
    /// @param innerPrinter Printer that receives all output; held by reference.
    /// @param depth        Initial value of the tracked element depth.
    PrettyXMLPrinter(VfsXMLPrinter& innerPrinter, int depth = 0);

    /// Visit a document. Records whether the document wants entities processed
    /// before forwarding to the inner printer.
    bool VisitEnter(const tinyxml2::XMLDocument& doc) override
    {
        mProcessEntities = doc.ProcessEntities();
        return mInnerPrinter.VisitEnter(doc);
    }

    /// Visit a document.
    bool VisitExit(const tinyxml2::XMLDocument& doc) override
    {
        return mInnerPrinter.VisitExit(doc);
    }

    /// Visit an element.
    bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) override;

    /// Visit an element. Leaves the element, so the tracked depth drops by one.
    bool VisitExit(const tinyxml2::XMLElement& element) override
    {
        mDepth--;
        return mInnerPrinter.VisitExit(element);
    }

    /// Visit a declaration.
    bool Visit(const tinyxml2::XMLDeclaration& declaration) override
    {
        return mInnerPrinter.Visit(declaration);
    }

    /// Visit a text node.
    bool Visit(const tinyxml2::XMLText& text) override
    {
        return mInnerPrinter.Visit(text);
    }

    /// Visit a comment node.
    bool Visit(const tinyxml2::XMLComment& comment) override
    {
        return mInnerPrinter.Visit(comment);
    }

    /// Visit an unknown node.
    bool Visit(const tinyxml2::XMLUnknown& unknown) override
    {
        return mInnerPrinter.Visit(unknown);
    }

    /// Writes a single attribute through the inner printer.
    void PushAttribute(const char* name, const char* value, bool compactMode);

    // Re-implement private functionality in TinyXML2 library, this is just a copy-paste job
    void PrintString(const char*, bool restrictedEntitySet); // prints out, after detecting entities.

    // The inner printer we are wrapping, we only support VfsXMLPrinter based classes because
    // stock tinyxml printer is very closed
    VfsXMLPrinter& mInnerPrinter;

    // Track private fields that are necessary for private functionality in TinyXML2
    int mDepth;

    // Both flags get default member initializers so they never hold an
    // indeterminate value: mProcessEntities is otherwise only assigned once a
    // document has been visited, and mCompactMode is not assigned anywhere in
    // this class definition.
    bool mProcessEntities = true;
    bool mCompactMode = false;

    enum
    {
        ENTITY_RANGE = 64,
        BUF_SIZE = 200
    };

    // Lookup tables indexed by character value (below ENTITY_RANGE).
    bool mEntityFlag[ENTITY_RANGE];
    bool mRestrictedEntityFlag[ENTITY_RANGE];
};
#endif //_FSTINYXML_H_