A minimal SAX parser
This is a follow-up of my notes on a DOM parser.
SAX (Simple API for XML) is an XML-specific parser that overcomes some of the drawbacks of the DOM parser.
Similarly to DOM, the same concepts that define the SAX parser can be used for other types of data, a more generic name for this kind of parser is "event-based parser".
The main idea is that the user provides some callbacks to be invoked while parsing the document. Through those callbacks, the user of the parser can decide what to do.
The immediate and obvious advantage is that all the unnecessary allocations, copies, and conversions between containers when using a DOM parser do not take place; the user can decide to keep only what is relevant and save it directly where it wants to.
Sample implementation
One can take the implementation of the DOM parser, and modify it accordingly.
Instead of returning values, the parse functions do not return anything but call the appropriate callback.
Apart from this change, the implementation looks more or less the same.
#include <string_view>
#include <stdexcept>
#include <cassert>
#include <charconv>
struct json_callbacks{
// default implementation is a nop
virtual void on_null(){}
virtual void on_bool(bool){}
virtual void on_string(std::string_view){}
virtual void on_number(int ){}
virtual void on_key(std::string_view){}
virtual void on_array_begin(){}
virtual void on_array_end(){}
virtual void on_object_begin(){}
virtual void on_object_end(){}
[[noreturn]] virtual void on_error(){throw std::runtime_error("Error while parsing");}
};
bool advance_lit_name(std::string_view& sv, std::string_view v){
if(sv.starts_with(v)) {
sv = sv.substr(v.size());
return true;
}
return false;
}
bool advance_struct_char(std::string_view& sv, char c, json_callbacks& cb){
auto pos_leading_space = sv.find_first_not_of(" \t\n\r");
pos_leading_space = pos_leading_space==std::string_view::npos ? 0 : pos_leading_space;
if(sv.empty() or sv[pos_leading_space] != c) {
return false;
}
switch(c){
// no need to notify for those first two cases as there is an on_key callback
case ':': ; break;
case ',': ; break;
case '[': cb.on_array_begin(); break;
case ']': cb.on_array_end(); break;
case '{': cb.on_object_begin(); break;
case '}': cb.on_object_end(); break;
default : assert(false);
}
sv = sv.substr(pos_leading_space+1);
pos_leading_space = sv.find_first_not_of(" \t\r\n");
if(pos_leading_space!=std::string_view::npos) sv = sv.substr(pos_leading_space);
return true;
}
bool isdigit(char c){ return c >= '0' && c <= '9'; }
void parse_string(std::string_view& sv, json_callbacks& cb, bool key = false){
if(not sv.starts_with('"')) cb.on_error();
auto pos = sv.find('"', 1); // NOTE: real JSON might contain escaped double quotes
if(pos == std::string_view::npos) cb.on_error();
auto v = sv.substr(1, pos-1);
key ? cb.on_key(v) : cb.on_string(v);
sv = sv.substr(pos+1);
}
void parse_number(std::string_view& sv, json_callbacks& cb){
assert(sv[0] == '-' || isdigit(sv[0]));
int number;
auto [ptr, ec] = std::from_chars(sv.data(), sv.data()+sv.size(), number);
if(ec != std::errc{})cb.on_error();
cb.on_number(number);
sv = sv.substr(ptr - sv.data());
}
void parse_value(std::string_view& sv, json_callbacks& cb);
void parse_array(std::string_view& sv, json_callbacks& cb){
auto check = advance_struct_char(sv, '[', cb);
assert(check);
if(advance_struct_char(sv, ']', cb)) return;
do{
parse_value(sv, cb);
}while(advance_struct_char(sv, ',', cb));
if(not advance_struct_char(sv, ']', cb)) cb.on_error();
}
void parse_object(std::string_view& sv, json_callbacks& cb);
void parse_value(std::string_view& sv, json_callbacks& cb) {
if(sv.empty()) cb.on_error();
/**/ if(advance_lit_name(sv, "null") ) cb.on_null();
else if(advance_lit_name(sv, "true") ) cb.on_bool(true);
else if(advance_lit_name(sv, "false")) cb.on_bool(false);
else if(sv[0] == '-' || isdigit(sv[0])) parse_number(sv, cb);
else if(sv[0] == '"') parse_string(sv, cb);
else if(sv[0] == '[') parse_array(sv, cb);
else if(sv[0] == '{') parse_object(sv, cb);
else cb.on_error();
}
void parse_object(std::string_view& sv, json_callbacks& cb) {
if(not advance_struct_char(sv, '{', cb)) cb.on_error();
if(advance_struct_char(sv, '}', cb)){
if(sv.size() > 1) cb.on_error();
return;
}
do{
if(not sv.starts_with('"')) cb.on_error();
parse_string(sv, cb, true);
if(not advance_struct_char(sv, ':', cb)) cb.on_error();
parse_value(sv, cb);
}while(advance_struct_char(sv, ',', cb));
if(not advance_struct_char(sv, '}', cb)) cb.on_error();
if(sv.size() > 1) cb.on_error();
}
void parse_json(std::string_view sv, json_callbacks& cb) {
return parse_object(sv, cb);
}
There is another notable difference to the original DOM algorithm; the parse_string
method won an additional boolean parameter used to differentiate a key from other strings.
This is not strictly necessary; json_callbacks
could drop the on_key
method, but I found it much easier to leave it, and I hope the reason will be clear when looking at how to use this parser.
One example usage, for copying the data from JSON to a user-defined structure, could look like the following:
#include <string>
#include <vector>
struct mydata{
std::string name = "";
int age = -1;
std::vector<std::string> documents;
};
struct mycallbacks : json_callbacks{
mydata d;
std::vector<std::string_view> object_stack;
std::string_view last_key;
explicit mycallbacks() = default;
void on_string(std::string_view v) override {
if(object_stack == std::vector<std::string_view>{""} and last_key == "name") {
d.name = std::string(v);
} else if(object_stack == std::vector<std::string_view>{""} and last_key == "documents" ) {
d.documents.emplace_back(v);
}
}
void on_number(int v) override {
if(object_stack == std::vector<std::string_view>{""} and last_key == "age") {
d.age = v;
}
}
void on_key(std::string_view v) override {last_key = v;}
void on_object_begin() override { object_stack.push_back(last_key); }
void on_object_end() override { object_stack.pop_back(); }
};
int main(){
auto cb = mycallbacks();
auto str = R"({"name":"John", "age": 30, "documents":["1","2","3"] })";
parse_json(str, cb);
assert(cb.d.name == "John");
assert(cb.d.age == 30);
assert(cb.d.documents.size() == 3);
assert(cb.d.documents[0] == "1");
}
While the implementation of the SAX parser is similar to the implementation of the DOM parser, the method for extracting the data is completely different.
The biggest disadvantage is that the callback functions are called without context.
A value is pushed to on_string
, but where does it come from? Is it inside an array or an object? If in an object, which one? For this reason, I decided to provide an on_key
function, otherwise the implementer of on_string
also needed to track if the pushed string was a key or not!
And even with on_key
, the implementation is complex. One needs to remember in which object it is (with the member functions on_key
, on_object_begin
, …) and only then it can decide if the parsed value is the one that needs to be saved or not.
And it is still missing error-handling. If age is a string
and not a number
, we would probably like to know it and act accordingly, instead of silently ignoring the value.
While the SAX parser will generally consume less memory than the DOM parser, it still needs to use additional memory for tracking the context.
Some considerations
As mentioned at the beginning, no matter how big the JSON message is, only the relevant parts are copied over, and memory usage increases only if there are nested JSON structures.
Another advantage of this implementation is that there is no reason to define a type for arrays and objects; the end user can store the values how it prefers, instead of doing conversion.
The presented API uses only int
, bool
, and std::string_view
, which are as cheap as it gets to copy and convert to other formats.
On the other hand, this API is much harder to use.
In particular, inside mycallbacks
, one needs to track the state and type of the data, if not, you might be copying the wrong elements or missing them.
In this sense, there is no context when getting the events.
Another property of this approach is that the document can be traversed only once. This is problematic if you want to save some elements depending from other elements that might come afterwards.
A feature that seems reasonable, but that looks like a can of worms, is being able to control the parsing algorithms with additional callbacks.
It sounds nice, for example, to be able to pause the parsing of a document and resume it afterward. With the current API, if parsing is stopped (for example through an exception), then you must parse the whole data from the beginning. This is problematic if you want, for example, to parse some elements depending on external factors, like the input of a user, but do not want to block inside a callback.
One could extend the json_callback
interface to return something, in order to control the parsing instead of relying on exceptions.
A boolean might be enough to stop the parsing, but at that point, why not return a more complex data type that can be used for skipping elements?
It wouldn’t be unreasonable to want to skip all the following elements in an array or an object. If there are no errors, the parser will parse all elements to be ignored, and the callback could return immediately, minimizing the overhead. In case of invalid JSON, one needs to be prepared to also catch the thrown exception.
Currently, stopping parsing seems to be the only reasonable use case, but since with exceptions it is already possible to stop the parsing at any moment, and assuming that exceptions are not an issue, does it make sense to introduce a second control structure for exiting early?
Supporting resuming means to store somewhere the parsed state (the information that is missing from the callback), saving and restoring it would make the SAX parser much more complex.
A DOM parser on top of a SAX parser
The fact that SAX and DOM parsers have different advantages, seems to imply that ideally, one might need to write two parsers for every format and use the one more appropriate for the current use case.
Writing a SAX parser on top of a DOM parser makes little sense, as using a DOM parser underneath negates most of the advantages of the SAX parses and of the DOM parser.
Fortunately writing a DOM parser on top of a SAX parser requires only some additional overhead compared to writing a DOM parser directly, which makes it in my opinion a sensible choice:
struct value_t{
std::variant<bool, int, std::string, std::nullptr_t, std::vector<value_t>, std::map<std::string, value_t>> data;
};
template <class V>
struct stack_emplacer{
std::string_view current_key;
V v;
template <class T>
V* operator()(T* arg) const {
if(arg == nullptr) return nullptr; // in case of duplicated key entries, subsequent keys are ignored (but still parsed)
if constexpr (std::is_same_v<T, std::map<std::string, value_t>>){
auto [it, inserted] = arg->emplace(current_key, v);
return std::get_if<V>(&(it->second.data));
}else if constexpr (std::is_same_v<T, std::vector<value_t>>){
arg->emplace_back(v);
return std::get_if<V>(&(arg->back().data));
}else {
static_assert(false, "non-exhaustive visitor!");
}
}
};
template<class V> stack_emplacer(std::string_view, V) -> stack_emplacer<V>;
struct createdom_callbacks : json_callbacks{
std::map<std::string, value_t> parsed;
std::vector<std::variant<std::map<std::string, value_t>*, std::vector<value_t>*>> object_arr_stack;
std::string_view current_key;
explicit createdom_callbacks() = default;
void on_string(std::string_view v) override {
assert(not object_arr_stack.empty());
std::visit( stack_emplacer{current_key, std::string(v)}, object_arr_stack.back());
}
void on_number(int v) override {
assert(not object_arr_stack.empty());
std::visit( stack_emplacer{current_key, v}, object_arr_stack.back());
}
void on_null() override {
assert(not object_arr_stack.empty());
std::visit( stack_emplacer{current_key, std::nullptr_t{}}, object_arr_stack.back());
}
void on_bool(bool v) override {
assert(not object_arr_stack.empty());
std::visit( stack_emplacer{current_key, v}, object_arr_stack.back());
}
void on_key(std::string_view v) override {
current_key = v;
}
void on_object_begin() override {
std::map<std::string, value_t>* ptr = object_arr_stack.empty() ?
&parsed :
std::visit( stack_emplacer{current_key, std::map<std::string, value_t>{}}, object_arr_stack.back());
object_arr_stack.push_back(ptr);
}
virtual void on_array_begin() override {
assert(not object_arr_stack.empty());
std::vector<value_t>* ptr = std::visit( stack_emplacer{current_key, std::vector<value_t>{}}, object_arr_stack.back());
object_arr_stack.push_back(ptr);
}
void on_object_end() override {
assert(not object_arr_stack.empty());
assert((std::get_if<std::map<std::string, value_t>*>(&object_arr_stack.back()) != nullptr));
object_arr_stack.pop_back();
}
void on_array_end() override {
assert(not object_arr_stack.empty());
assert(std::get_if<std::vector<value_t>*>(&object_arr_stack.back()) != nullptr);
object_arr_stack.pop_back();
assert(not object_arr_stack.empty());
}
};
int main(){
auto str = R"({"name":"John", "age": 30, "documents":["1","2",["3", 4], {
"name": "Bob",
"age": 28
}] })";
auto cb = createdom_callbacks();
parse_json(str, cb);
mprint(cb.parsed);
}
The additional overhead is object_arr_stack
, which acts as a stack of elements to track the current context.
Thus the most sensible approach seems to be to implement a SAX parser and if it makes sense, a DOM parser can be created on top of it.
An alternate approach to object_arr_stack
would be that every parsed JSON object stores a reference to its parent, but that would mean creating wrapper types for every object and increasing the size of every object too.
If you have questions, comments, or found typos, the notes are not clear, or there are some errors; then just contact me.