LP#1709710: Make chunk sizing smart about XML quoting
author Mike Rylander <miker@esilibrary.com>
Fri, 18 Aug 2017 15:43:31 +0000 (11:43 -0400)
committer Galen Charlton <gmc@equinoxinitiative.org>
Mon, 11 Sep 2017 13:30:03 +0000 (09:30 -0400)
XML inside JSON as a quoted string that's itself inside XML causes quite the
pile up of nested escaping of certain characters in OpenSRF PARTIAL_RESPONSE
messages.  Here we check for the worst offenders (<, >, &, and ") and account
for the cost of escaping them in chunked response stanzas.

Signed-off-by: Mike Rylander <mrylander@gmail.com>
Signed-off-by: Galen Charlton <gmc@equinoxinitiative.org>
Signed-off-by: Bill Erickson <berickxx@gmail.com>
Signed-off-by: Jason Stephenson <jason@sigio.com>
Signed-off-by: Galen Charlton <gmc@equinoxinitiative.org>

include/opensrf/utils.h
src/libopensrf/osrf_app_session.c
src/libopensrf/osrf_application.c
src/libopensrf/utils.c
src/perl/lib/OpenSRF/AppSession.pm

index 2276dd6..34e0ba6 100644 (file)
@@ -377,6 +377,12 @@ char* md5sum( const char* text, ... );
 */
 int osrfUtilsCheckFileDescriptor( int fd );
 
+/*
+       Returns the approximate additional length of
+       a string after XML escaping <, >, &, and ".
+*/
+size_t osrfXmlEscapingLength ( const char* str );
+
 #ifdef __cplusplus
 }
 #endif
index 5633e1b..28242e7 100644 (file)
@@ -1363,13 +1363,20 @@ int osrfAppRequestRespondComplete(
                        OSRF_STATUS_COMPLETE );
 
        if (data) {
-
                char* json = jsonObjectToJSON(data);
-               size_t data_size = strlen(json);
+               size_t raw_size = strlen(json);
+               size_t extra_size = osrfXmlEscapingLength(json);
+               size_t data_size = raw_size + extra_size;
                size_t chunk_size = OSRF_MSG_CHUNK_SIZE;
-               if (chunk_size > 0 && chunk_size < data_size) {
 
-                       osrfSendChunkedResult(ses, requestId, json, data_size, chunk_size);
+               if (data_size > chunk_size) // calculate an escape-scaled chunk size
+                       chunk_size = ((double)raw_size / (double)data_size) * (double)chunk_size;
+
+               if (chunk_size > 0 && chunk_size < raw_size) {
+                       // chunking -- response message exceeds max message size.
+                       // break it up into chunks for partial delivery
+
+                       osrfSendChunkedResult(ses, requestId, json, raw_size, chunk_size);
                        osrfAppSessionSendBatch( ses, &status, 1 );
 
                } else {
index ca6c219..4a5f53e 100644 (file)
@@ -734,15 +734,20 @@ static int _osrfAppRespond( osrfMethodContext* ctx, const jsonObject* data, int
 
                if( data ) {
             char* data_str = jsonObjectToJSON(data); // free me (below)
-            size_t data_size = strlen(data_str);
+            size_t raw_size = strlen(data_str);
+            size_t extra_size = osrfXmlEscapingLength(data_str);
+            size_t data_size = raw_size + extra_size;
             size_t chunk_size = ctx->method->max_chunk_size;
 
-            if (chunk_size > 0 && chunk_size < data_size) {
+            if (data_size > chunk_size) // calculate an escape-scaled chunk size
+                chunk_size = ((double)raw_size / (double)data_size) * (double)chunk_size;
+
+            if (chunk_size > 0 && chunk_size < raw_size) {
                 // chunking -- response message exceeds max message size.
                 // break it up into chunks for partial delivery
 
                                osrfSendChunkedResult(ctx->session, ctx->request,
-                                                                         data_str, data_size, chunk_size);
+                                                                         data_str, raw_size, chunk_size);
 
             } else {
 
index 6628c8c..1c049c0 100644 (file)
@@ -781,3 +781,26 @@ int osrfUtilsCheckFileDescriptor( int fd ) {
        return 0;
 }
 
+size_t osrfXmlEscapingLength ( const char* str ) { // approximate extra bytes added by escaping <, >, & and "; NULL is treated as empty
+       size_t extra = 0; // unsigned, same type as the return value, so the sum cannot overflow a signed int
+       const char* s;
+       for (s = str; s && *s; ++s) { // "s &&" makes a NULL str yield 0 instead of dereferencing it
+               switch (*s) {
+                       case '>':
+                       case '<':
+                               extra += 3; // 1 char becomes &lt; or &gt; (4 chars)
+                               break;
+                       case '&':
+                               extra += 4; // 1 char becomes &amp; (5 chars)
+                               break;
+                       case '"':
+                               extra += 11; // estimate for a quote under nested JSON-in-XML escaping (backslashes plus &quot;)
+                               break;
+                       default:
+                               break;
+               }
+       }
+
+       return extra;
+}
+
index 36d56b0..bb99787 100644 (file)
@@ -1051,10 +1051,28 @@ sub respond {
 
         if ($self->max_chunk_size > 0) { # we might need to chunk
             my $str = OpenSRF::Utils::JSON->perl2JSON($msg);
-            if (length($str) > $self->max_chunk_size) { # send partials ("chunking")
-                for (my $i = 0; $i < length($str); $i += $self->max_chunk_size) {
+
+            # XML can add a lot of length to a chunk due to escaping, so we
+            # calculate chunk size based on an XML-escaped version of the message.
+            # Example: If escaping doubles the length of the string then $ratio
+            # will be 0.5 and we'll cut the chunk size for this message in half.
+
+            my $raw_length = length($str);
+            my $escaped_length = $raw_length;
+            $escaped_length += 11 * (() = ( $str =~ /"/g)); # 7 \s and &quot;
+            $escaped_length += 4 * (() = ( $str =~ /&/g)); # &amp;
+            $escaped_length += 3 * (() = ( $str =~ /[<>]/g)); # &lt; / &gt;
+
+            my $chunk_size = $self->max_chunk_size;
+            # Shrink the chunk size by the raw/escaped ratio so each chunk still fits once XML-escaped.
+            if ($escaped_length > $self->max_chunk_size) {
+                $chunk_size = int(($raw_length / $escaped_length) * $self->max_chunk_size) || 1; # whole number >= 1: substr() truncates fractional offsets, so a fractional step would skip characters between chunks
+            }
+
+            if ($raw_length > $chunk_size) { # send partials ("chunking")
+                for (my $i = 0; $i < length($str); $i += $chunk_size) {
                     $response = new OpenSRF::DomainObject::oilsResult::Partial;
-                    $response->content( substr($str, $i, $self->max_chunk_size) );
+                    $response->content( substr($str, $i, $chunk_size) );
                     $self->session->send($type, $response, $self->threadTrace);
                 }
                 # This triggers reconstruction on the remote end