uapi-json icon indicating copy to clipboard operation
uapi-json copied to clipboard

Investigate new parsers

Open dchertousov opened this issue 4 years ago • 5 comments

Investigate parsers:

  • xml2js without custom configuration
  • xpath

dchertousov avatar Jun 28 '20 18:06 dchertousov

I've done some investigation with xpath and raw xml2js. It looks like xml2js is the faster option, though it does not really matter, as the difference is less than 200 ms when parsing 1000 files.

Attached is the sample script I used. Please take a look and compare which syntax you would prefer for the next-gen parser: xml2js with dotted-like access, or xpath with expressions.

const fs = require('fs');
const path = require('path');
const xpath = require('xpath');
const Dom = require('xmldom').DOMParser;
const xml2js = require('xml2js');

// Number of times each parser is run over the same document in the benchmark loops.
const TRIES = 1000;

// Sample XML response, read once at load time so file I/O stays outside the timed loops.
const testFilePath = path.join(__dirname, 'test', 'FakeResponses', 'Air', 'AirGetTickets-several-tickets.xml');
const testFile = fs.readFileSync(testFilePath).toString();

/**
 * Finds the `xmlns:` declaration for a namespace family in a raw XML string.
 *
 * The declared prefix may be the bare name (`air`) or carry a version suffix
 * of the form `_v<digits>_<digits>` (`air_v47_0`); both forms are matched.
 *
 * @param {string} str - Raw XML text to search.
 * @param {string} namespace - Base namespace prefix, without version suffix.
 * @returns {{documentNamespace: string, url: string}} The prefix exactly as it
 *   is written in the document, and the namespace URI bound to it.
 * @throws {Error} If the document does not declare a matching namespace.
 */
function getNamespaceInfo(str, namespace) {
  // Escape regex metacharacters so prefixes such as `a.b` are matched literally.
  const escapedNamespace = namespace.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const re = new RegExp(`xmlns:(${escapedNamespace}(?:_v\\d+_\\d+)?)="([^\\"]+)"`);
  const match = str.match(re);
  if (match === null) {
    // Fail with a descriptive message instead of a "null is not iterable" TypeError.
    throw new Error(`Namespace "${namespace}" is not declared in the document`);
  }
  const [, documentNamespace, url] = match;
  return { documentNamespace, url };
}

/**
 * Two implementations of the same ticket parser, kept side by side so that
 * their speed and readability can be compared:
 *
 *  - `fn.xml2js` walks the plain-object tree produced by `xml2js.parseString`
 *    (attributes under `$`, child elements as arrays keyed by qualified name);
 *  - `fn.xpath` queries a DOM document with namespace-aware XPath expressions.
 *
 * Both `parse` entry points return the parsed list of ETRs, each shaped as
 * `{ locatorCode, fareCalculation, passengers, tickets }`.
 */
const fn = {
  xml2js: {
    /**
     * Maps each base namespace name to the prefix actually used in the document.
     *
     * @param {string} str - Raw XML text.
     * @param {string[]} list - Base namespace names to look up.
     * @returns {Object<string, string>} Base name -> prefix as written in the document.
     */
    getNamespaces: (str, list) => {
      const prefixes = {};
      list.forEach((namespace) => {
        prefixes[namespace] = getNamespaceInfo(str, namespace).documentNamespace;
      });
      return prefixes;
    },
    /**
     * Extracts the key and name of a booking traveler node.
     *
     * @param {Object<string, string>} ns - Namespace prefixes (uses `ns.common`).
     * @param {Object} passenger - Traveler node from the xml2js tree.
     * @returns {{key: string, firstName: string, lastName: string}}
     */
    parsePassenger: (ns, passenger) => {
      const key = passenger.$.Key;
      // Only the first BookingTravelerName child is read.
      const { First: firstName, Last: lastName } = passenger[`${ns.common}:BookingTravelerName`][0].$;

      return {
        key,
        firstName,
        lastName,
      };
    },
    /**
     * Converts the attributes of a coupon node into a camelCased record.
     *
     * @param {Object} coupon - Coupon node from the xml2js tree.
     * @returns {Object} Coupon fields; a field is `undefined` when its attribute is absent.
     */
    parseCoupon: (coupon) => {
      const {
        MarketingCarrier: airline,
        MarketingFlightNumber: flightNumber,
        Origin: origin,
        Destination: destination,
        DepartureTime: departure,
        NotValidBefore: notValidBefore,
        NotValidAfter: notValidAfter,
        Status: status,
      } = coupon.$;

      return {
        airline,
        flightNumber,
        origin,
        destination,
        departure,
        notValidBefore,
        notValidAfter,
        status,
      };
    },
    /**
     * Extracts the ticket number and all coupons of a ticket node.
     *
     * @param {Object<string, string>} ns - Namespace prefixes (uses `ns.air`).
     * @param {Object} ticket - Ticket node from the xml2js tree.
     * @returns {{ticketNumber: string, coupons: Object[]}}
     */
    parseTicket: (ns, ticket) => {
      const { TicketNumber: ticketNumber } = ticket.$;
      const coupons = ticket[`${ns.air}:Coupon`]
        .map(coupon => fn.xml2js.parseCoupon(coupon));

      return {
        ticketNumber,
        coupons,
      };
    },
    /**
     * Parses a single ETR node into locator code, fare calculation,
     * passengers and tickets.
     *
     * @param {Object<string, string>} ns - Namespace prefixes (`ns.air`, `ns.common`).
     * @param {Object} etr - ETR node from the xml2js tree.
     * @returns {{locatorCode: *, fareCalculation: *, passengers: Object[], tickets: Object[]}}
     */
    parseEtr: (ns, etr) => {
      // Child elements are arrays; the first entry is taken for single-valued elements.
      const locatorCode = etr[`${ns.air}:AirReservationLocatorCode`][0];
      const fareCalculation = etr[`${ns.air}:FareCalc`][0];
      const tickets = etr[`${ns.air}:Ticket`]
        .map(ticket => fn.xml2js.parseTicket(ns, ticket));
      const passengers = etr[`${ns.common}:BookingTraveler`]
        .map(passenger => fn.xml2js.parsePassenger(ns, passenger));

      return {
        locatorCode,
        fareCalculation,
        passengers,
        tickets,
      };
    },
    /**
     * Parses a raw XML response with xml2js and extracts every ETR.
     *
     * @param {string} str - Raw XML text.
     * @returns {Promise<Object[]>} Resolves with the parsed ETRs; rejects with
     *   the error reported by `xml2js.parseString`.
     */
    parse: async (str) => {
      // Adapt the callback-style xml2js API to a promise.
      const obj = await new Promise((resolve, reject) => {
        xml2js.parseString(str, (err, resp) => {
          if (err) {
            reject(err);
            return;
          }
          resolve(resp);
        });
      });
      const ns = fn.xml2js.getNamespaces(str, ['air', 'common']);
      // Return the result so callers can inspect it or compare it with the xpath output.
      return obj['SOAP:Envelope']['SOAP:Body'][0][`${ns.air}:AirRetrieveDocumentRsp`][0][`${ns.air}:ETR`]
        .map(etr => fn.xml2js.parseEtr(ns, etr));
    },
  },
  xpath: {
    /**
     * Reads several attributes of a DOM element into a renamed record.
     *
     * @param {{getAttribute: function(string): *}} element - Element to read from.
     * @param {Object<string, string>} attributesMap - Attribute name -> output key.
     * @returns {Object} Output key -> value returned by `element.getAttribute`.
     */
    getAttributes: (element, attributesMap) => {
      const attributes = {};
      Object.entries(attributesMap).forEach(([attributeName, key]) => {
        attributes[key] = element.getAttribute(attributeName);
      });
      return attributes;
    },
    /**
     * Maps each base namespace name to the namespace URI declared in the document.
     *
     * @param {string} str - Raw XML text.
     * @param {string[]} list - Base namespace names to look up.
     * @returns {Object<string, string>} Base name -> namespace URI.
     */
    getNamespaces: (str, list) => {
      const urls = {};
      list.forEach((namespace) => {
        urls[namespace] = getNamespaceInfo(str, namespace).url;
      });
      return urls;
    },
    /**
     * Extracts the key and name of a booking traveler element.
     *
     * @param {Object} passenger - Traveler DOM element.
     * @param {function(string, Object): *} expr - Namespace-aware XPath evaluator.
     * @returns {{key: *, firstName: *, lastName: *}}
     */
    parsePassenger: (passenger, expr) => {
      const key = passenger.getAttribute('Key');
      // Only the first BookingTravelerName child is read.
      const [travelerName] = expr('./common:BookingTravelerName', passenger);
      return {
        key,
        ...fn.xpath.getAttributes(travelerName, { First: 'firstName', Last: 'lastName' }),
      };
    },
    /**
     * Converts the attributes of a coupon element into a camelCased record.
     *
     * @param {Object} coupon - Coupon DOM element.
     * @returns {Object} Coupon fields.
     */
    parseCoupon: (coupon) => {
      return fn.xpath.getAttributes(coupon, {
        MarketingCarrier: 'airline',
        MarketingFlightNumber: 'flightNumber',
        Origin: 'origin',
        Destination: 'destination',
        DepartureTime: 'departure',
        NotValidBefore: 'notValidBefore',
        NotValidAfter: 'notValidAfter',
        Status: 'status',
      });
    },
    /**
     * Extracts the ticket number and all coupons of a ticket element.
     *
     * @param {Object} ticket - Ticket DOM element.
     * @param {function(string, Object): *} expr - Namespace-aware XPath evaluator.
     * @returns {{ticketNumber: *, coupons: Object[]}}
     */
    parseTicket: (ticket, expr) => {
      const ticketNumber = ticket.getAttribute('TicketNumber');
      const coupons = expr('./air:Coupon', ticket)
        .map(coupon => fn.xpath.parseCoupon(coupon));
      return {
        ticketNumber,
        coupons,
      };
    },
    /**
     * Parses a single ETR element into locator code, fare calculation,
     * passengers and tickets.
     *
     * @param {Object} etr - ETR DOM element.
     * @param {function(string, Object): *} expr - Namespace-aware XPath evaluator.
     * @returns {{locatorCode: *, fareCalculation: *, passengers: Object[], tickets: Object[]}}
     */
    parseEtr: (etr, expr) => {
      // `string(...)` asks XPath for the text value rather than a node list.
      const locatorCode = expr('string(./air:AirReservationLocatorCode)', etr);
      const fareCalculation = expr('string(./air:FareCalc)', etr);
      const tickets = expr('./air:Ticket', etr)
        .map(ticket => fn.xpath.parseTicket(ticket, expr));
      const passengers = expr('./common:BookingTraveler', etr)
        .map(passenger => fn.xpath.parsePassenger(passenger, expr));

      return {
        locatorCode,
        fareCalculation,
        passengers,
        tickets,
      };
    },
    /**
     * Parses a raw XML response into a DOM document and extracts every ETR
     * with XPath.
     *
     * @param {string} str - Raw XML text.
     * @returns {Object[]} The parsed ETRs.
     */
    parse: (str) => {
      // Explicit MIME type: the input is XML; NOTE(review): newer xmldom
      // releases are reported to require this argument — confirm on upgrade.
      const doc = new Dom().parseFromString(str, 'text/xml');
      const namespaces = fn.xpath.getNamespaces(str, ['SOAP', 'air', 'common']);
      const expr = xpath.useNamespaces(namespaces);
      // Return the result so callers can inspect it or compare it with the xml2js output.
      return expr('/SOAP:Envelope/SOAP:Body/air:AirRetrieveDocumentRsp/air:ETR', doc)
        .map(etr => fn.xpath.parseEtr(etr, expr));
    },
  },
};

/**
 * Benchmarks both parser implementations by running each one `TRIES` times
 * over the same XML document and printing the elapsed time per implementation
 * via `console.time` / `console.timeEnd`.
 *
 * @returns {Promise<void>} Resolves once both timed loops have finished;
 *   rejects if any parse call throws or rejects.
 */
async function main() {
  // Xpath
  console.time('Xpath');
  for (let i = 0; i < TRIES; i += 1) {
    fn.xpath.parse(testFile);
  }
  console.timeEnd('Xpath');

  // Xml2Js
  console.time('Xml2Js');
  for (let i = 0; i < TRIES; i += 1) {
    // Awaited one at a time on purpose: the timer should cover sequential parses.
    // eslint-disable-next-line no-await-in-loop
    await fn.xml2js.parse(testFile);
  }
  console.timeEnd('Xml2Js');
}

// Report failures explicitly instead of leaving the promise floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

dchertousov avatar Jun 28 '20 21:06 dchertousov

I prefer xpath, as it more closely resembles the original XML structure and is therefore more readable.

I would really like some active users and contributors to join the discussion: if the code becomes less complex, more useful contributions can be expected.

@nausik , @cursedcoder , @shmuga , @InkFaust , @Smotrov , @creduo , @yaerhf , @articice , @jthoma , @engrashid , @JairoPanduro

dchertousov avatar Jun 28 '20 21:06 dchertousov

Although xml2js might be slightly faster than xpath (the 200 ms difference you mentioned), I think xpath is much easier to read and understand. If the speed difference doesn't matter that much, I'd prefer to stick with xpath rather than xml2js.

I'd also love to hear from other contributors on this matter.

mark-omarov avatar Jun 30 '20 09:06 mark-omarov

xpath <3

cursedcoder avatar Jun 30 '20 10:06 cursedcoder

Readability matters more, since it makes the code easier for more contributors to adopt. I would go with xpath for this one.

On Tue, Jun 30, 2020, 4:26 PM Yevhenii Huselietov [email protected] wrote:

xpath <3

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/Travelport-Ukraine/uapi-json/issues/483#issuecomment-651719934, or unsubscribe https://github.com/notifications/unsubscribe-auth/ABOWA54M6R3LCRKEXCGSXM3RZHAFNANCNFSM4OKUV7KQ .

jthoma avatar Jul 01 '20 04:07 jthoma