Blog Code
Outline


Home

Parsing Some Simple LaTeX




Some LaTeX doesn't get postfixed easily so we need special parsing to handle these cases. This post will be updated as more cases are handled and changes are made.

Matrices

LaTeX matrices should be converted to multi-dimensional arrays for simple postfixing. A LaTeX matrix could look something like "\begin{bmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \end{bmatrix}".

We will search for the substring "\begin" and then check that the inside part is a matrix. Then we just grab everything before we hit the "\end{". For now we will just split the rows at "\\" and the columns at "&".

/**
 * Converts a LaTeX matrix environment into nested-array text.
 *
 * `input` is the text that follows "\begin{", for example
 * "bmatrix} 1 & 2 \\ 3 & 4 \end{bmatrix} + 1". The environment name is read up
 * to the first "}", the body is taken up to the matching "\end{name}", rows are
 * split at "\\" and columns at "&".
 *
 * @param {string} input Text starting with the environment name.
 * @returns {string|boolean} "[[...], [...]]" followed by whatever came after
 *     the "\end{name}", or false when the input is not a single, non-nested
 *     environment whose name contains "matrix".
 */
function parseLatexMatrix(input){
    // The environment name must be closed by "}" before any "{" shows up.
    var close = input.indexOf("}");
    var open = input.indexOf("{");
    if (close < 0 || (open > -1 && open < close)){
        return false;
    }
    var key = input.substring(0,close);
    var idx = close+1;

    var end = input.indexOf("\\end{"+key+"}");
    if (end < 0){
        return false;
    }
    // A second "\begin{key}" before the "\end{key}" means a nested environment,
    // which this simple splitter does not handle.
    var begin = input.indexOf("\\begin{"+key+"}",idx);
    if (begin != -1 && begin <= end){
        return false;
    }
    if (key.indexOf("matrix") < 0){
        return false;
    }

    var rows = input.substring(idx,end).trim().split("\\\\");
    var parts = [];
    for (var i=0;i<rows.length;i++){
        var cells = rows[i].trim();
        // Blank rows (leading/trailing "\\") are dropped entirely.
        if (cells.length == 0){continue;}
        parts.push("["+cells.replace(/&/g,", ")+"]");
    }
    // Joining the kept rows guarantees separators only appear between rows,
    // even when the first split segment was blank.
    // "\end{" + key + "}" is key.length + 6 characters long.
    return "["+parts.join(", ")+"]"+input.substring(end+6+key.length);
}

Derivatives

Many derivatives will look like "\frac{d}{dx}[]" or "\frac{\mathrm{d}}{\mathrm{d}x}[]".

We will look for "\frac" and then check whether there is either a \mathrm{d} or a d in the numerator. We need to determine the variable of differentiation by taking what is left in the denominator once the d is removed. Then we just grab what is inside the []. Our output will be of the form "der(f(x),x)" so that it can be easily postfixed.

We will also add a check for "\partial" later.

/**
 * Rewrites a LaTeX derivative such as "\frac{d}{dx}[f(x)]" as "der(f(x),x)".
 *
 * `input` is the text that follows "\frac{"; the numerator's opening brace has
 * already been consumed, which is why the depth counter starts at 1. Three
 * bracketed groups are read in turn: the numerator (must be exactly "d" or
 * "\mathrm{d}"), the denominator (must start with "d" or "\mathrm{d}"; the
 * remainder is the variable) and the body to differentiate. "{"/"[" and
 * "}"/"]" are counted together as one kind of bracket.
 *
 * @param {string} input Text following "\frac{".
 * @returns {string} "der(body,variable)" plus any trailing text, or
 *     "\frac{" + input unchanged when the text is not a derivative of this
 *     shape.
 */
function parseLatexDerivatives(input){
    var unparsed = "\\frac{"+input;
    var mathrmD = "\\mathrm{d}";
    var phase = 0;      // 0: numerator, 1: denominator, 2: body
    var depth = 1;
    var segStart = 0;   // index where the current group's content begins
    var dx = "x";
    for (var i=0;i<input.length;i++){
        var ch = input[i];
        if (ch == "{" || ch == "["){
            depth++;
            continue;
        }
        if (ch != "}" && ch != "]"){
            continue;
        }
        depth--;
        if (depth != 0){
            continue;
        }
        // A top-level group just closed at index i.
        var segment = input.substring(segStart,i);
        if (phase == 2){
            return "der("+segment+","+dx+")"+input.substring(i+1);
        }
        if (phase == 0){
            if (segment != "d" && segment != mathrmD){
                return unparsed;
            }
        }
        else {
            if (segment.substring(0,mathrmD.length) == mathrmD){
                dx = segment.substring(mathrmD.length);
            }
            else if (segment[0] == "d"){
                dx = segment.substring(1);
            }
            else {
                return unparsed;
            }
            // "\frac{d}{d}" names no variable, so it is not a derivative.
            if (dx.length == 0){
                return unparsed;
            }
        }
        // The next group must open right here. Without this check the
        // character at i+1 would be skipped blindly and e.g. "d}{dx}x^{2}"
        // would lose its "x" and come out unbalanced.
        var next = input[i+1];
        if (next != "{" && next != "["){
            return unparsed;
        }
        phase++;
        segStart = i+2;
    }
    return unparsed;
}

Integrals

Many integrals are "\int_a^b f(x) \mathrm{d}x". We will look for the "\int" and then try to figure out the bounds, the function, and the variable of integration.

We want the bounds if they are included, and will look for single-character bounds as well as more complex expressions in {}. The end of the integrand should be marked by either \mathrm{d} or d. We will assume the variable of integration is just one character, but more robust parsing should probably be added for flexibility. Everything in between the bounds and the "d" is the function being integrated.

Integrals might include formatting options like "\!" or "\," so we will also remove those.

/**
 * Rewrites a LaTeX integral as "int(f,x)" or "int(f,x,lower,upper)".
 *
 * `input` is the text that follows "\int". An optional "_lower" may come first,
 * optionally followed by "^upper"; each bound is either a single character or
 * a balanced "{...}" group. The integrand runs up to the first "\mathrm{d}"
 * or, failing that, the first plain "d"; the single character after that
 * marker is the variable of integration. "\," and "\!" are removed from the
 * integrand.
 *
 * Known limitation: the plain "d" search takes the first "d" it finds, so an
 * integrand that itself contains a "d" (e.g. "\delta") is split too early.
 *
 * @param {string} input Text following "\int".
 * @returns {string} The "int(...)" call followed by any text after the
 *     variable of integration.
 */
function parseLatexInt(input){
    // Reads one bound starting at `start`: a "{...}" group (returned without
    // its outer braces) or a single character. Returns null when a group is
    // opened but never closed.
    function readScript(start){
        if (input[start] != "{"){
            // charAt yields "" past the end, so a missing bound is simply empty.
            return {text: input.charAt(start), next: start+1};
        }
        var depth = 1;
        var text = "";
        for (var i=start+1;i<input.length;i++){
            if (input[i] == "{"){
                depth++;
            }
            else if (input[i] == "}"){
                depth--;
                if (depth == 0){
                    return {text: text, next: i+1};
                }
            }
            text += input[i];
        }
        return null;
    }

    // Drops the LaTeX spacing commands "\," and "\!".
    function stripSpacing(text){
        return text.replace(/\\\,/g,"").replace(/\\\!/g,"");
    }

    var sub = "";
    var sup = "";
    var cursor = 0;   // index where the integrand starts
    if (input[0] == "_"){
        var lower = readScript(1);
        // An unterminated group is left in place rather than half-consumed.
        if (lower !== null){
            sub = lower.text;
            cursor = lower.next;
            if (input[cursor] == "^"){
                var upper = readScript(cursor+1);
                if (upper !== null){
                    sup = upper.text;
                    cursor = upper.next;
                }
            }
        }
    }

    var tail = input.substring(cursor);
    var marker = "\\mathrm{d}";
    var dindex = tail.indexOf(marker);
    if (dindex < 0){
        marker = "d";
        dindex = tail.indexOf(marker);
    }

    var inside = "";
    var rest = "";
    var dx = "x";
    if (dindex < 0){
        // No differential found: the whole tail is the integrand.
        inside = stripSpacing(tail);
    }
    else {
        inside = stripSpacing(tail.substring(0,dindex));
        var varIndex = dindex+marker.length;
        // Keep the default "x" when the marker is the last character instead
        // of reading past the end of the string.
        if (varIndex < tail.length){
            dx = tail[varIndex];
        }
        rest = tail.substring(varIndex+1);
    }

    var out = "int("+inside+","+dx;
    if (sub != "" || sup != ""){
        out += ","+sub+","+sup;
    }
    out += ")";
    return out+rest;
}

Roots

The string "sqrt" usually refers to the normal square root function, but in LaTeX it can be modified to handle cube roots and more with "\sqrt[3]{x}". I want the sqrt function to take just one input so we need to combine the expression into something like "\sqrt{3,x}". We just need to find the end of the [] segment and move what is inside to the {}.

Limits and More

Several LaTeX expressions depend on whitespace for correct parsing. For limits, sums, and products we will perform a replacement before replacing whitespace. We will basically look for the next two groups of whitespace and replace them with "{" and "}" to keep the expression properly grouped. Then we can handle these functions similarly to integrals looking for the bounds and everything.

/**
 * Wraps the term between the next two top-level whitespace gaps in "{" "}".
 *
 * The first run of whitespace found outside any (), {} or [] marks where the
 * term starts: its last whitespace character is replaced by "{". The term
 * ends at whichever comes first:
 *   - the next top-level whitespace character, which is replaced by "}";
 *   - a closing bracket that has no opener inside the term ("}" is inserted
 *     in front of it);
 *   - the end of the string ("}" is appended).
 * The input is returned unchanged when there is no top-level whitespace, when
 * nothing follows it, or when a closing bracket follows it directly.
 *
 * @param {string} input Expression text that still contains its whitespace.
 * @returns {string} The text with one term wrapped in braces.
 */
function nextSpaces(input) {
  function isSpace(ch) {
    return ch == " " || ch == "\t" || ch == "\n";
  }
  function isOpener(ch) {
    return ch == "(" || ch == "{" || ch == "[";
  }
  function isCloser(ch) {
    return ch == ")" || ch == "}" || ch == "]";
  }

  // Find the first whitespace character that sits outside every bracket.
  var depth = 0;
  var gap = -1;
  for (var i=0;i<input.length;i++){
    if (isOpener(input[i])){
      depth++;
    }
    else if (isCloser(input[i])){
      depth--;
    }
    else if (depth == 0 && isSpace(input[i])){
      gap = i;
      break;
    }
  }
  if (gap < 0){
    return input;
  }

  // Skip the rest of that whitespace run to reach the term's first character.
  var start = gap;
  while (start < input.length && isSpace(input[start])){
    start++;
  }
  if (start == input.length || isCloser(input[start])){
    return input;
  }

  // The opening brace is placed relative to where the term starts, so a term
  // that begins with a bracket cannot cause a real character to be overwritten.
  input = input.substring(0,start-1)+"{"+input.substring(start);

  // Walk the term, tracking bracket depth relative to its start.
  depth = 0;
  for (var j=start;j<input.length;j++){
    if (isOpener(input[j])){
      depth++;
    }
    else if (isCloser(input[j])){
      depth--;
      if (depth < 0){
        // This closer belongs to an enclosing group: end the term before it.
        return input.substring(0,j)+"}"+input.substring(j);
      }
    }
    else if (depth == 0 && isSpace(input[j])){
      return input.substring(0,j)+"}"+input.substring(j+1);
    }
  }
  // The term runs to the end of the string, including when it is a single
  // character, so it is always closed.
  return input+"}";
}

Final Product

View this repl for the up-to-date parsing code. To learn more about the cleaning steps before parsing LaTeX, read this post. To learn more about postfixing the result of this step, read this post.

Or to just see it in action, enter a LaTeX expression below and it will hopefully get converted if it is similar to something above.


Output: ...