Improve TinyXML2 output formatting

This commit is contained in:
Lukas Aldershaab 2022-01-02 12:18:25 +01:00
parent 3812ce2e82
commit 165459c90b
2 changed files with 308 additions and 139 deletions

View file

@ -26,6 +26,182 @@
#include "console/console.h"
// Re-implement private functionality in TinyXML2
// Character constants used by the re-implemented TinyXML2 helpers below.
// All line endings are normalized to LF; CR gets filtered out.
static constexpr char LINE_FEED = '\x0a';
static constexpr char LF = LINE_FEED;
static constexpr char CARRIAGE_RETURN = '\x0d';
static constexpr char CR = CARRIAGE_RETURN;
// Quote characters that have predefined XML entities.
static constexpr char SINGLE_QUOTE = '\'';
static constexpr char DOUBLE_QUOTE = '"';
// Describes one predefined XML entity: the name that is written between
// '&' and ';', the length of that name, and the character it stands for.
struct Entity {
// Entity name without the surrounding '&' and ';'.
const char* pattern;
// Number of characters in `pattern`.
int length;
// The literal character this entity replaces in the output.
char value;
};
// Table of the five entities that PrintString can emit.
// NUM_ENTITIES must match the number of rows in `entities`.
static const int NUM_ENTITIES = 5;
static const Entity entities[NUM_ENTITIES] = {
{ "quot", 4, DOUBLE_QUOTE },
{ "amp", 3, '&' },
{ "apos", 4, SINGLE_QUOTE },
{ "lt", 2, '<' },
{ "gt", 2, '>' }
};
// Constructs a printer that sends its output to `stream`.
//
// stream  - destination stream; held by reference for the printer's lifetime.
// compact - forwarded to tinyxml2::XMLPrinter.
// depth   - initial indentation depth; forwarded to the base class and also
//           tracked locally in _depth.
//
// Entity processing defaults to enabled so that PrintString behaves
// deterministically even when the printer is used without first visiting an
// XMLDocument (VisitEnter(XMLDocument) overwrites the flag when it runs).
VfsXMLPrinter::VfsXMLPrinter(FileStream& stream, bool compact, int depth)
    : XMLPrinter(NULL, compact, depth),
      m_Stream(stream),
      _depth(depth),
      _processEntities(true)
{
    // Start with no character flagged as an entity.
    for (int i = 0; i < ENTITY_RANGE; ++i) {
        _entityFlag[i] = false;
        _restrictedEntityFlag[i] = false;
    }

    // Flag every character that appears in the entity table.
    for (int i = 0; i < NUM_ENTITIES; ++i) {
        const char entityValue = entities[i].value;
        const unsigned char flagIndex = static_cast<unsigned char>(entityValue);
        TIXMLASSERT(flagIndex < ENTITY_RANGE);
        _entityFlag[flagIndex] = true;
    }

    // The restricted set only escapes the characters below.
    _restrictedEntityFlag[static_cast<unsigned char>('&')] = true;
    _restrictedEntityFlag[static_cast<unsigned char>('<')] = true;
    _restrictedEntityFlag[static_cast<unsigned char>('>')] = true; // not required, but consistency is nice
}
// Calls flush() and then close() on the stream this printer writes to.
// NOTE(review): m_Stream is a reference to a stream supplied by the caller,
// so destroying the printer closes the caller's stream - confirm callers expect that.
VfsXMLPrinter::~VfsXMLPrinter()
{
m_Stream.flush();
m_Stream.close();
}
// Writes the NUL-terminated string `p` to the stream, replacing characters
// that have a predefined entity with "&name;".
//
// p          - text to write; must be NUL-terminated.
// restricted - when true only the restricted entity set ('&', '<', '>') is
//              escaped; otherwise every character in the entity table is.
//
// When entity processing is disabled the string is written verbatim.
void VfsXMLPrinter::PrintString(const char* p, bool restricted)
{
    if (!_processEntities) {
        Write(p);
        return;
    }

    // Look for runs of bytes between entities to print.
    const char* q = p;
    const bool* flag = restricted ? _restrictedEntityFlag : _entityFlag;
    while (*q) {
        TIXMLASSERT(p <= q);
        // Remember, char is sometimes signed. (How many times has that bitten me?)
        if (*q > 0 && *q < ENTITY_RANGE) {
            // Check for entities. If one is found, flush
            // the stream up until the entity, write the
            // entity, and keep looking.
            if (flag[static_cast<unsigned char>(*q)]) {
                while (p < q) {
                    const size_t delta = static_cast<size_t>(q - p);
                    const int toPrint = (static_cast<size_t>(INT_MAX) < delta) ? INT_MAX : static_cast<int>(delta);
                    Write(p, toPrint);
                    p += toPrint;
                }
                bool entityPatternPrinted = false;
                for (int i = 0; i < NUM_ENTITIES; ++i) {
                    if (entities[i].value == *q) {
                        Putc('&');
                        Write(entities[i].pattern, entities[i].length);
                        Putc(';');
                        entityPatternPrinted = true;
                        break;
                    }
                }
                if (!entityPatternPrinted) {
                    // TIXMLASSERT( entityPatternPrinted ) causes gcc -Wunused-but-set-variable in release
                    TIXMLASSERT(false);
                }
                // Skip the character that was just emitted as an entity.
                ++p;
            }
        }
        ++q;
        TIXMLASSERT(p <= q);
    }

    // Flush the remaining string. This will be the entire string if an
    // entity wasn't found. Written in INT_MAX-sized chunks, looping until
    // everything is out, so that a run longer than INT_MAX is not truncated.
    while (p < q) {
        const size_t delta = static_cast<size_t>(q - p);
        const int toPrint = (static_cast<size_t>(INT_MAX) < delta) ? INT_MAX : static_cast<int>(delta);
        Write(p, toPrint);
        p += toPrint;
    }
}
// Called when printing enters the document node.
// Copies the document's entity-processing setting into the local
// _processEntities flag (read by PrintString), then defers to the base printer.
// Returns whatever XMLPrinter::VisitEnter returns.
bool VfsXMLPrinter::VisitEnter(const tinyxml2::XMLDocument& doc)
{
_processEntities = doc.ProcessEntities();
return XMLPrinter::VisitEnter(doc);
}
// Called when printing leaves an element.
// Decrements the locally tracked depth (incremented in VisitEnter for
// elements) and lets the base printer close the element.
bool VfsXMLPrinter::VisitExit(const tinyxml2::XMLElement& element)
{
_depth--;
return XMLPrinter::VisitExit(element);
}
// Add VFS friendly implementations of output functions
// printf-style output: hands `format` and the variadic arguments to the
// stream's writeFormattedBuffer().
void VfsXMLPrinter::Print(const char* format, ...)
{
va_list va;
va_start(va, format);
m_Stream.writeFormattedBuffer(format, va);
// va_end must pair with va_start before returning.
va_end(va);
}
// Passes `size` and `data` to the stream's write(); note the stream takes
// the size first and the data pointer second.
void VfsXMLPrinter::Write(const char* data, size_t size)
{
m_Stream.write(size, data);
}
// Writes the single character `ch` to the stream, converted to U8.
void VfsXMLPrinter::Putc(char ch)
{
m_Stream.write(static_cast<U8>(ch));
}
// Overwrite Visitation of elements to add newlines before attributes
// Opens `element` and prints all of its attributes through the local
// PushAttribute, which decides between a space and a newline before each one.
//
// element   - element being entered.
// attribute - first attribute of the element, or null when it has none.
//
// Always returns true so that the element's children are visited.
bool VfsXMLPrinter::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* attribute)
{
    // Compact mode is taken from the parent element when there is one,
    // otherwise from the element itself.
    const tinyxml2::XMLNode* parentNode = element.Parent();
    const tinyxml2::XMLElement* owner = parentNode ? parentNode->ToElement() : 0;
    const bool compactMode = owner ? CompactMode(*owner) : CompactMode(element);

    OpenElement(element.Name(), compactMode);
    _depth++;

    for (const tinyxml2::XMLAttribute* attr = attribute; attr; attr = attr->Next()) {
        PushAttribute(attr->Name(), attr->Value(), compactMode);
    }
    return true;
}
// Writes one attribute as name="value" on the currently open element.
//
// name        - attribute name, written verbatim.
// value       - attribute value, written through PrintString with the full
//               (non-restricted) entity set.
// compactMode - true: separate from the previous token with a single space;
//               false: start a new line indented to the current depth.
void VfsXMLPrinter::PushAttribute(const char* name, const char* value, bool compactMode)
{
    TIXMLASSERT(_elementJustOpened);

    if (!compactMode) {
        Putc('\n');
        PrintSpace(_depth);
    } else {
        Putc(' ');
    }

    Write(name);
    Write("=\"");
    PrintString(value, false);
    Putc('\"');
}
bool VfsXMLDocument::LoadFile(const char* pFilename)
{
// Expand the file-path.
@ -61,6 +237,113 @@ bool VfsXMLDocument::LoadFile(const char* pFilename)
return true;
}
// Loads and parses an XML document from `stream`.
//
// The whole stream is read into a temporary buffer, line endings are
// normalized in place (CR+LF and lone CR both become LF), and the result is
// handed to Parse().
//
// Returns true when no error is set after parsing. Sets
// XML_ERROR_EMPTY_DOCUMENT when the stream reports a size <= 0 and
// XML_ERROR_FILE_COULD_NOT_BE_OPENED when reading from the stream fails.
bool VfsXMLDocument::LoadFile(FileStream& stream)
{
    // Delete the existing data:
    Clear();
    // Clear shadowed error
    ClearError();
    //TODO: Can't clear location, investigate if this gives issues.
    //doc.location.Clear();

    // Get the file size, so we can pre-allocate the string. HUGE speed impact.
    long length = stream.getStreamSize();

    // Strange case, but good to handle up front.
    if (length <= 0)
    {
        SetError(tinyxml2::XML_ERROR_EMPTY_DOCUMENT, 0, 0);
        return false;
    }

    // From the XML spec, 2.11 End-of-Line Handling:
    //   ...the XML processor MUST behave as if it normalized all line breaks in
    //   external parsed entities (including the document entity) on input,
    //   before parsing, by translating both the two-character sequence #xD #xA
    //   and any #xD that is not followed by #xA to a single #xA character.
    // The data is therefore read raw and normalized by hand below instead of
    // relying on a text-mode read to do it.
    char* buf = new char[length + 1];
    buf[0] = 0;

    if (!stream.read(length, buf))
    {
        delete[] buf;
        SetError(tinyxml2::XML_ERROR_FILE_COULD_NOT_BE_OPENED, 0, 0);
        return false;
    }

    // Process the buffer in place to normalize new lines.
    // Copies from the 'p' to the 'q' pointer, where p advances faster than q
    // whenever a CR+LF pair is collapsed into a single LF.
    //   * LF:    Unix and Unix-like systems
    //   * CR+LF: DOS, Microsoft Windows
    //   * CR:    Mac OS up to version 9
    const char* p = buf; // the read head
    char* q = buf;       // the write head
    const char CR = 0x0d;
    const char LF = 0x0a;

    buf[length] = 0;
    while (*p)
    {
        assert(p < (buf + length));
        assert(q <= (buf + length));
        assert(q <= p);

        if (*p == CR)
        {
            *q++ = LF;
            p++;
            if (*p == LF)
            {
                // check for CR+LF (and skip LF)
                p++;
            }
        }
        else
        {
            *q++ = *p++;
        }
    }
    assert(q <= (buf + length));
    *q = 0;

    // Hand over only the normalized bytes. The original `length` also counts
    // the stale bytes left behind the terminator when CR+LF pairs were
    // collapsed.
    Parse(buf, static_cast<size_t>(q - buf));

    delete[] buf;
    return !Error();
}
// Serializes this document to `stream` through a VfsXMLPrinter
// (non-compact, starting depth 0).
// Returns true when no error is set after printing.
// The local printer flushes and closes `stream` in its destructor when this
// function returns.
bool VfsXMLDocument::SaveFile(FileStream& stream)
{
// Clear any error from the last save, otherwise it will get reported
// for *this* call.
ClearError();
VfsXMLPrinter printer(stream, false, 0);
Print(&printer);
return !Error();
}
bool VfsXMLDocument::SaveFile(const char* pFilename)
{
// Expand the file-name into the file-path buffer.
@ -118,142 +401,3 @@ void VfsXMLDocument::SetError(tinyxml2::XMLError error, int lineNum, const char*
_errorStr.SetStr(buffer);
delete[] buffer;
}
// Constructs a printer that sends its output to `stream`.
// NULL is passed as the base XMLPrinter's first argument; `compact` and
// `depth` are forwarded unchanged.
VfsXMLPrinter::VfsXMLPrinter(FileStream& stream, bool compact, int depth)
: XMLPrinter(NULL, compact, depth),
m_Stream(stream)
{
}
// Calls flush() and then close() on the stream this printer writes to.
// NOTE(review): the stream is supplied by the caller, so destroying the
// printer closes the caller's stream - confirm callers expect that.
VfsXMLPrinter::~VfsXMLPrinter()
{
m_Stream.flush();
m_Stream.close();
}
// printf-style output: hands `format` and the variadic arguments to the
// stream's writeFormattedBuffer().
void VfsXMLPrinter::Print(const char* format, ...)
{
va_list va;
va_start(va, format);
m_Stream.writeFormattedBuffer(format, va);
// va_end must pair with va_start before returning.
va_end(va);
}
// Passes `size` and `data` to the stream's write(); note the stream takes
// the size first and the data pointer second.
void VfsXMLPrinter::Write(const char* data, size_t size)
{
m_Stream.write(size, data);
}
// Writes the single character `ch` to the stream, converted to U8.
void VfsXMLPrinter::Putc(char ch)
{
m_Stream.write(static_cast<U8>(ch));
}
// Loads and parses an XML document from `stream`.
//
// The whole stream is read into a temporary buffer, line endings are
// normalized in place (CR+LF and lone CR both become LF), and the result is
// handed to Parse().
//
// Returns true when no error is set after parsing. Sets
// XML_ERROR_EMPTY_DOCUMENT when the stream reports a size <= 0 and
// XML_ERROR_FILE_COULD_NOT_BE_OPENED when reading from the stream fails.
bool VfsXMLDocument::LoadFile(FileStream& stream)
{
    // Delete the existing data:
    Clear();
    // Clear shadowed error
    ClearError();
    //TODO: Can't clear location, investigate if this gives issues.
    //doc.location.Clear();

    // Get the file size, so we can pre-allocate the string. HUGE speed impact.
    long length = stream.getStreamSize();

    // Strange case, but good to handle up front.
    if (length <= 0)
    {
        SetError(tinyxml2::XML_ERROR_EMPTY_DOCUMENT, 0, 0);
        return false;
    }

    // From the XML spec, 2.11 End-of-Line Handling:
    //   ...the XML processor MUST behave as if it normalized all line breaks in
    //   external parsed entities (including the document entity) on input,
    //   before parsing, by translating both the two-character sequence #xD #xA
    //   and any #xD that is not followed by #xA to a single #xA character.
    // The data is therefore read raw and normalized by hand below instead of
    // relying on a text-mode read to do it.
    char* buf = new char[length + 1];
    buf[0] = 0;

    if (!stream.read(length, buf))
    {
        delete [] buf;
        SetError(tinyxml2::XML_ERROR_FILE_COULD_NOT_BE_OPENED, 0, 0);
        return false;
    }

    // Process the buffer in place to normalize new lines.
    // Copies from the 'p' to the 'q' pointer, where p advances faster than q
    // whenever a CR+LF pair is collapsed into a single LF.
    //   * LF:    Unix and Unix-like systems
    //   * CR+LF: DOS, Microsoft Windows
    //   * CR:    Mac OS up to version 9
    const char* p = buf; // the read head
    char* q = buf;       // the write head
    const char CR = 0x0d;
    const char LF = 0x0a;

    buf[length] = 0;
    while (*p)
    {
        assert(p < (buf+length));
        assert(q <= (buf+length));
        assert(q <= p);

        if (*p == CR)
        {
            *q++ = LF;
            p++;
            if (*p == LF)
            {
                // check for CR+LF (and skip LF)
                p++;
            }
        }
        else
        {
            *q++ = *p++;
        }
    }
    assert(q <= (buf+length));
    *q = 0;

    // Hand over only the normalized bytes. The original `length` also counts
    // the stale bytes left behind the terminator when CR+LF pairs were
    // collapsed.
    Parse(buf, static_cast<size_t>(q - buf));

    delete [] buf;
    return !Error();
}
// Serializes this document to `stream` through a VfsXMLPrinter
// (non-compact, starting depth 0).
// Returns true when no error is set after printing.
// The local printer flushes and closes `stream` in its destructor when this
// function returns.
bool VfsXMLDocument::SaveFile(FileStream& stream)
{
// Clear any error from the last save, otherwise it will get reported
// for *this* call.
ClearError();
VfsXMLPrinter printer(stream, false, 0);
Print(&printer);
return !Error();
}

View file

@ -40,10 +40,35 @@ public:
// Creates a printer that writes to `stream`. `compact` defaults to false and
// `depth` to 0.
VfsXMLPrinter(FileStream& stream, bool compact = false, int depth = 0);
// Flushes and closes the stream the printer was constructed with.
~VfsXMLPrinter() override;
// Re-implement private functionality in TinyXML2 library, this is just a copy-paste job
// Prints a string after detecting entities; `restrictedEntitySet` selects
// the smaller set of escaped characters.
void PrintString(const char*, bool restrictedEntitySet);
// Visitor overrides. Marked `override` like the output functions in this
// class so the compiler verifies they match the base-class virtuals.
bool VisitEnter(const tinyxml2::XMLDocument& /*doc*/) override;
bool VisitExit(const tinyxml2::XMLElement& element) override;
// Add VFS friendly implementations of output functions
// printf-style formatted output to the stream.
void Print(const char* format, ...) override;
// Writes `size` bytes starting at `data`.
void Write(const char* data, size_t size) override;
// Convenience overload for NUL-terminated strings; the length comes from strlen.
inline void Write(const char* data) { Write(data, strlen(data)); }
// Writes a single character.
void Putc(char ch) override;
// Overwrite Visitation of elements to add newlines before attributes
// Visitor override for elements; marked `override` like the output functions
// in this class so the compiler verifies it matches the base-class virtual.
bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* attribute) override;
// Writes one attribute; `compactMode` selects a space (true) or a newline
// plus indentation (false) before the attribute.
void PushAttribute(const char* name, const char* value, bool compactMode);
// Accept a virtual FileStream instead of a FILE pointer
// Destination stream; held by reference, so it must outlive the printer.
FileStream& m_Stream;
// Track private fields that are necessary for private functionality in TinyXML2
// Indentation depth used when an attribute is placed on its own line.
int _depth;
// Whether PrintString replaces special characters with entities.
bool _processEntities;
enum {
// Size of the entity flag tables; only characters below this value can be flagged.
ENTITY_RANGE = 64,
// NOTE(review): not referenced in the code visible here - confirm it is still needed.
BUF_SIZE = 200
};
// Per-character flags: true when the character must be written as an entity.
bool _entityFlag[ENTITY_RANGE];
// Same as _entityFlag, but for the restricted entity set.
bool _restrictedEntityFlag[ENTITY_RANGE];
};
class VfsXMLDocument : public tinyxml2::XMLDocument