import * as pdfjsLib from "pdfjs-dist";
import pdfjsWorker from "../pdfjs-static/pdf.worker.min.data";

// Tell pdf.js which worker script to use: the locally imported `pdfjsWorker`
// asset (see the import above) rather than a CDN-hosted copy.
pdfjsLib.GlobalWorkerOptions.workerSrc = pdfjsWorker; // version 2.12.313
// Alternative CDN locations for the same worker version, left disabled:
// "https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.12.313/pdf.worker.min.js";
// "https://cdn.jsdelivr.net/npm/pdfjs-dist@2.12.313/build/pdf.worker.min.js";

/**
 * Reads `input` into an ArrayBuffer with a FileReader and hands the bytes to
 * `loadPDFDoc`.
 *
 * @param {Blob|File} input - value passed straight to
 *   `FileReader.readAsArrayBuffer`.
 * @returns {Promise<*>} Settles with whatever `loadPDFDoc` settles with.
 *   Rejects if the read fails or is aborted, or if `loadPDFDoc` throws or
 *   rejects (previously those cases left the promise pending forever).
 */
const loadPDF = (input) => {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      try {
        // Resolving with a promise adopts its outcome, so a parsing failure
        // inside loadPDFDoc reaches the caller as a rejection.
        resolve(loadPDFDoc(reader.result));
      } catch (err) {
        reject(err);
      }
    };

    // Without these handlers a failed or cancelled read would never settle.
    reader.onerror = () => reject(reader.error);
    reader.onabort = () => reject(new Error("File reading was aborted"));

    // Start reading; a synchronous throw here rejects via the executor.
    reader.readAsArrayBuffer(input);
  });
};

/**
 * Opens a PDF from in-memory bytes via `pdfjsLib.getDocument({ data })`.
 *
 * @param {ArrayBuffer|Uint8Array} buffer - PDF bytes handed to pdf.js as
 *   `data`.
 * @returns {Promise<*>} Settles with the loading task's `promise`: the loaded
 *   document on success, a rejection on failure (previously a failure left
 *   the returned promise pending forever).
 */
const loadPDFDoc = (buffer) =>
  new Promise((resolve) => {
    // Resolving with the task's promise forwards both outcomes; a synchronous
    // throw from getDocument is turned into a rejection by the executor.
    resolve(pdfjsLib.getDocument({ data: buffer }).promise);
  });

/**
 * Opens a PDF by handing `url` to `pdfjsLib.getDocument`.
 *
 * @param {*} url - passed unchanged to `pdfjsLib.getDocument`.
 * @returns {Promise<*>} Settles with the loading task's `promise`: the loaded
 *   document on success, a rejection on failure (previously a failed load
 *   left the returned promise pending forever).
 */
const loadPDFFromURL = (url) =>
  new Promise((resolve) => {
    // Adopt the loading task's promise so rejections propagate to the caller.
    resolve(pdfjsLib.getDocument(url).promise);
  });

/**
 * Gathers the text fragments of every page of a loaded document.
 *
 * @param {object} PDFDocumentInstance - object exposing `numPages` and
 *   `getPage(n)` (1-based), where each page offers `getTextContent()`.
 * @returns {Promise<Array<{pageItems: Array<{s: *, eol: *, x: *, y: *}>, pageIndex: *}>>}
 *   One entry per page, in page order. `x`/`y` are entries 4 and 5 of the
 *   item's `transform`; `pageIndex` is read from the page's `_pageIndex`.
 */
export const getPDFTextAllPages = (PDFDocumentInstance) => {
  const pageCount = PDFDocumentInstance.numPages;

  // Reduce one text item to the compact record handed back to callers.
  const toFragment = (item) => ({
    s: item.str,
    eol: item.hasEOL,
    x: item.transform[4],
    y: item.transform[5],
  });

  // Turn a loaded page into its list of fragments plus its page index.
  const extractPage = (loadedPage) =>
    loadedPage.getTextContent().then((content) => {
      const pageItems = [];
      content.items.forEach((item) => pageItems.push(toFragment(item)));
      return { pageItems: pageItems, pageIndex: loadedPage._pageIndex };
    });

  // Request every page up front; pages are numbered from 1.
  const pending = [];
  let pageNo = 1;
  while (pageNo <= pageCount) {
    pending.push(PDFDocumentInstance.getPage(pageNo).then(extractPage));
    pageNo += 1;
  }

  // Wait for all pages; the per-page results are returned as-is.
  return Promise.all(pending).then((perPage) => perPage);
};

/**
 * Finds the first entry of `inArray` that has an own property `inKey`
 * strictly equal to `value`.
 *
 * @param {Array|null|undefined} inArray - list to search; `null`/`undefined`
 *   are treated as empty (previously `null` threw a TypeError).
 * @param {string} inKey - own-property name to look up on each entry.
 * @param {*} value - value compared with `===`.
 * @returns {number} Index of the first matching entry, or -1 if none.
 */
function hasValue(inArray, inKey, value) {
  if (inArray == null) return -1;

  for (let idx = 0; idx < inArray.length; idx++) {
    const entry = inArray[idx];
    if (
      entry != null &&
      // Call through Object.prototype so entries created without a prototype
      // (Object.create(null)) are handled instead of throwing.
      Object.prototype.hasOwnProperty.call(entry, inKey) &&
      entry[inKey] === value
    ) {
      return idx;
    }
  }

  return -1;
}

// source: https://jsperf.com/count-string-occurrence-in-string/3
/**
 * Counts non-overlapping occurrences of `substring` inside `string`.
 * Needles shorter than five characters are deliberately not counted and
 * always yield 0.
 *
 * @param {string} substring - text to look for.
 * @param {string} string - text to search in.
 * @returns {number} Number of non-overlapping matches.
 */
const occurrences = (substring, string) => {
  const step = substring.length;
  if (!(step >= 5)) return 0;

  let count = 0;
  // Jump past each hit by the needle length so matches never overlap.
  for (
    let at = string.indexOf(substring, 0);
    at > -1;
    at = string.indexOf(substring, at + step)
  ) {
    count += 1;
  }
  return count;
};

/**
 * Repairs common extraction artefacts in a reference string: stray spaces
 * before punctuation, missing spaces after commas and periods, and
 * hyphenation left over from line breaks.
 *
 * Fixes over the previous version:
 *  - the "x.y" rule contained a stray `.` that swallowed one character
 *    ("a.bc" became "a. c"); it now yields "a. bc";
 *  - the right-hand character is matched with a lookahead, so runs such as
 *    "J.K.Rowling" or "a,b,c" are fixed in full instead of every other one.
 *
 * Note: any letter/digit on both sides of a period is separated, so numbers
 * like "3.14" become "3. 14".
 *
 * @param {string} refString - raw reference text.
 * @returns {string} The cleaned-up text.
 */
const fixRefString = (refString) => {
  let str = refString;

  // replace 'x .' / 'x ,' with 'x.' / 'x,'
  str = str.replace(/([A-Za-z0-9])\s+([.,])/g, "$1$2");

  // replace 'x,3' with 'x, 3'
  str = str.replace(/([A-Za-z0-9]),(?=[A-Za-z0-9])/g, "$1, ");

  // replace 'x.y' with 'x. y'
  str = str.replace(/([A-Za-z0-9])\.(?=[A-Za-z0-9])/g, "$1. ");

  // replace 'x- y' with 'xy'
  str = str.replace(/([A-Za-z])-\s(?=[A-Za-z0-9])/g, "$1");

  return str;
};

/**
 * Decides whether a line of text looks like the heading of a references
 * section.
 *
 * The line is split on single spaces. A one-token line qualifies when it
 * contains "eferences", "referen" or "iblio" (case-insensitive), which also
 * covers headings whose first letter was lost. A two-token line (e.g.
 * "7 References") qualifies when the second token starts with "references"
 * or contains "biblio".
 *
 * @param {string} text - candidate heading line.
 * @returns {boolean} True when the line looks like a references heading.
 */
const checkStartOfReferenceSection = (text) => {
  const tokens = text.split(" ");

  if (tokens.length === 1) {
    const word = tokens[0].toLowerCase();
    return ["eferences", "referen", "iblio"].some((part) =>
      word.includes(part)
    );
  }

  if (tokens.length === 2) {
    const word = tokens[1].toLowerCase();
    return word.startsWith("references") || word.includes("biblio");
  }

  return false;
};

/**
 * Scans the trailing pages of a document for a references section and splits
 * it into individual reference strings, using the x-positions of the text
 * items to infer the column layout of each page.
 *
 * Outline:
 *  1. Fetch the text items of every page from the zero-based index
 *     `referenceSectionStartsFromPage` to the end of the document.
 *  2. For each page, build a histogram of item x-positions
 *     (`pageLayoutStats`) and reduce it to a few dominant positions.
 *  3. Once a "references"-like heading has been seen, walk the items and
 *     assemble reference strings according to how many dominant x-positions
 *     the page has (1, 2, 3 or 4).
 *
 * Lines are lower-cased before being stored, so the returned strings are
 * lower-case. Progress is reported through `console.log`.
 *
 * @param {object} PDFDocumentInstance - object exposing `numPages` and
 *   `getPage(n)`; presumably a pdf.js document — TODO confirm.
 * @param {number} referenceSectionStartsFromPage - zero-based index of the
 *   first page to scan (pages are requested as `pageNumber + 1`).
 * @returns {Promise<string[]>|null} Promise of the collected reference
 *   strings, or `null` (not a promise) when the document has more than 50
 *   pages.
 */
const processReferences = (
  PDFDocumentInstance,
  referenceSectionStartsFromPage
) => {
  const pdf = PDFDocumentInstance;

  var numPages = pdf.numPages;

  // Documents longer than 50 pages are not processed.
  // NOTE(review): this path returns null synchronously while every other
  // path returns a Promise; callers must handle both.
  if (numPages > 50) return null;

  // NOTE(review): most of the declarations below are never read by the active
  // code, or are shadowed by `var` declarations of the same name inside the
  // final `.then` callback (refsArray, newRef, pageLayoutStats, pageMaxY,
  // pageMinY). Only `xPosMaxDiffForMerging` and `PATTERN_REFERENCE` are used
  // from there.
  var listOfPromises = [];
  //////////
  var debug = false;
  var foundNumberedReferences = false;
  var foundNonNumberedReferences = false;
  var refsArray = [];
  var newRef = false;
  var numRefs = 0;
  var refString = "";
  var pageLayoutStats = [];
  var pageMaxY = -1e6;
  var pageMinY = +1e6;
  var referenceContent = [];
  var regexpTerm1 = new RegExp(/appendix/g);
  var regexpTerm2 = new RegExp(/about\s+the\s+authors/g);
  var showBibliographyText = false;
  var endOfReferenceFound = false;
  // Two x-positions at most this far apart are merged into one cluster.
  var xPosMaxDiffForMerging = 7;
  // Matches a line starting with "references" / "referen" / "eferences",
  // tolerating whitespace between the letters.
  // NOTE(review): the `g` flag makes `.test()` stateful (it resumes from
  // `lastIndex` after a successful match), so a call right after a match can
  // return false for a line that would otherwise match.
  var PATTERN_REFERENCE = new RegExp(
    /(^r\s*e\s*f\s*e\s*r\s*e\s*n\s*c\s*e\s*s)|(^r\s*e\s*f\s*e\s*r\s*e\s*n\s*)|(^[Rr]*\s*e\s*f\s*e\s*r\s*e\s*n\s*c\s*e\s*s)/gim
  );

  //////////
  // Zero-based indices of the pages to scan.
  var pages = [];
  for (var i = referenceSectionStartsFromPage; i < pdf.numPages; i++) {
    pages.push(i);
  }
  var referencesStartFound = false;
  return Promise.all(
    pages.map(function (pageNumber) {
      // getPage is 1-based, hence the + 1.
      return pdf.getPage(pageNumber + 1).then(function (page) {
        return page.getTextContent().then(function (textContent) {
          // Copy the page's text items; a page without `items` yields [].
          var textItems = [];
          if (textContent.items)
            for (var tc = 0; tc < textContent.items.length; tc++) {
              textItems.push(textContent.items[tc]);
            }
          return textItems;
        });
      });
    })
  ).then(function (pages) {
    // From here on `pages` is the array of per-page text-item arrays and
    // shadows the outer list of page indices.
    var referencesStartFound = false;
    // Pre-pass: drop empty pages and probe every line for a heading.
    // The flag set here is reset to false further down before it is read, so
    // the lasting effects of this loop are the removal of empty pages and the
    // advancing of PATTERN_REFERENCE's lastIndex.
    var p = 0;
    while (p <= pages.length - 1) {
      // NOTE(review): if the removed page was the last one, `pages[p]` is
      // undefined on the next line and `.length` throws. Two consecutive
      // empty pages also leave the second one in place.
      if (pages[p].length === 0) pages.splice(p, 1);
      for (var tc = 0; tc < pages[p].length; tc++) {
        var line = pages[p][tc].str.trim();
        var tokens = line.split(" ");

        if (
          checkStartOfReferenceSection(line) ||
          PATTERN_REFERENCE.test(line)
        ) {
          referencesStartFound = true;
        }
      }
      p++;
    }
    // State shared across all pages of the main loop below.
    var patternType = ""; // sticky: once set it is never cleared
    var pageMaxY = -1e6; // running max/min y over all items seen so far
    var pageMinY = +1e6;
    var refsArray = [];
    var referencesStartFound = false;
    // NOTE(review): no active code sets referencesEndFound to true, so the
    // "process no more pages" branch at the end of the page loop never runs.
    var referencesEndFound = false;
    var referencesStartPage = -1;
    var refStartX = -1;
    var refStartY = -1;
    // Test for references starting with "[#] ..."
    var PATTERN_NUMBERED_1 = new RegExp(/(^\[\s*([0-9\s*\.*a-zA-Z]*)\s*\])/);
    // References starting with "12." not followed by a digit or lower-case letter.
    var PATTERN_NUMBERED_2 = new RegExp(/^([0-9]+\.)(?![0-9])(?![a-z])/);
    // References starting with "Lastname, X".
    // NOTE(review): the `g` flag makes `.test()` stateful here as well.
    var PATTERN_LASTNAME_1 = new RegExp(/^([A-Z][a-z]*,\s[A-Z].)/gm);
    // Becomes true once a references heading has been seen in the second pass.
    var processLines = false;

    // Classifies a line as the start of a numbered or author-name reference;
    // returns null when it matches neither.
    const getPattern = (line) => {
      if (PATTERN_NUMBERED_1.test(line)) {
        return "NUMBERED_REFS";
      } else if (PATTERN_NUMBERED_2.test(line)) {
        return "NUMBERED_REFS";
      } else if (PATTERN_LASTNAME_1.test(line)) {
        return "LASTNAME_REFS";
      } else {
        return null;
      }
    };

    for (var p = 0; p < pages.length; p++) {
      refStartX = -1;
      refStartY = -1;
      // Histogram of rounded x-positions for this page: [{ xpos, count }].
      var pageLayoutStats = [];

      // First pass over the page: collect layout statistics.
      for (var tc = 0; tc < pages[p].length; tc++) {
        if (!pages[p] || !pages[p][tc] || !pages[p][tc].transform) continue;
        ///////////////////////////////////
        // Remove empty entries
        // NOTE(review): after the splice, index tc refers to the following
        // item, which is used below without re-checking. If the removed item
        // was the last one, `pages[p][tc]` is undefined and the next line
        // throws.
        if (pages[p][tc].str.trim() === "") pages[p].splice(tc, 1);

        ///////////////////////////////////
        // Capture page layout data

        var refX = parseInt(pages[p][tc].transform[4].toFixed(0));
        var refY = parseInt(pages[p][tc].transform[5].toFixed(0));

        var foundAt = hasValue(pageLayoutStats, "xpos", refX);
        if (foundAt === -1) {
          pageLayoutStats.push({ xpos: refX, count: 1 });
        } else {
          pageLayoutStats[foundAt]["count"] += 1;
        }

        // These extremes accumulate over every page processed so far; they
        // are not reset per page.
        if (refY > pageMaxY) {
          pageMaxY = refY;
        }
        if (refY < pageMinY) {
          pageMinY = refY;
        }

        var line = pages[p][tc].str.trim().toLowerCase();
        line = line.replace(/\s\s+/g, " ");
        line = line.trim();
        var tokens = line.split(" ");

        if (PATTERN_REFERENCE.test(line)) {
          referencesStartFound = true;
          referencesStartPage = p;
        }
      }

      ///////////////////////////////////
      // Process page layout
      if (referencesStartFound && !referencesEndFound) {
        var maxCount = -1e6;
        pageLayoutStats.forEach(function (d) {
          if (d.count > maxCount) {
            maxCount = d.count;
          }
        });

        // prune the array 'pageLayoutStats' and retain only the most frequent x-positions
        // (drop positions with fewer than 10% of the top count, or fewer than 5 hits)
        var tmpLen = pageLayoutStats.length;
        while (tmpLen--) {
          if (pageLayoutStats[tmpLen].count < 0.1 * maxCount) {
            pageLayoutStats.splice(tmpLen, 1);
          } else if (pageLayoutStats[tmpLen].count < 5)
            pageLayoutStats.splice(tmpLen, 1);
        }

        pageLayoutStats.sort(function (a, b) {
          return a.xpos - b.xpos;
        });

        // Merge neighbouring x-positions that lie within xPosMaxDiffForMerging
        // of each other. After a splice the loop index still advances, so the
        // element that moved into slot i is skipped; presumably that is why
        // the merge is run twice.
        for (var it = 0; it < 2; it++) {
          tmpLen = pageLayoutStats.length;
          for (var i = 1; i < tmpLen; i++) {
            if (
              Math.abs(pageLayoutStats[i].xpos - pageLayoutStats[i - 1].xpos) <=
              xPosMaxDiffForMerging
            ) {
              pageLayoutStats[i - 1].count += pageLayoutStats[i].count;
              pageLayoutStats.splice(i, 1);
              --tmpLen;
            }
          }
        }

        // Collapse to one tuple if a 'column' of frequently occurring text
        //  has a very low frequency compared to high-frequency 'column' with
        //  high frequency of certain value of x-position. This is done to
        //  account for papers that have references in two columns but do not
        //  begin with numbers ('e.g., start with Author names instead of "[x] Alpha, B. ..."
        //  where 'x' is a alpha-numeric.
        if (pageLayoutStats.length >= 2) {
          tmpLen = pageLayoutStats.length;

          for (var i = 0; i < tmpLen - 1; i++) {
            if (pageLayoutStats[i].count - pageLayoutStats[i + 1].count > 100) {
              pageLayoutStats.splice(i + 1, 1);
              --tmpLen;
            }
          }
        }
        // Keep the remaining clusters ordered left to right.
        pageLayoutStats.sort((a, b) => {
          if (a.xpos < b.xpos) return -1;
          else if (a.xpos > b.xpos) return 1;
          else return 0;
        });
      }

      // Second pass over the page: assemble reference strings.
      // newRef holds the reference currently being built; it, layout and
      // refPattern are reset for every page.
      var newRef = null;
      var layout = null;
      var refPattern = null;
      for (var tc = 0; tc < pages[p].length; tc++) {
        if (!pages[p][tc]) continue;
        var line = pages[p][tc].str.trim().toLowerCase();
        line = line.replace(/\s\s+/g, " ");
        line = line.trim();

        if (!pages[p] || !pages[p][tc] || !pages[p][tc].transform) continue;
        // toFixed returns strings here (no parseInt as in the first pass);
        // the subtractions below coerce them back to numbers.
        var refX = pages[p][tc].transform[4].toFixed(0);
        var refY = pages[p][tc].transform[5].toFixed(0);

        // A heading line switches processing on and is itself skipped.
        if (checkStartOfReferenceSection(line)) {
          processLines = true;
          console.log("FOUND  1", line);
          continue;
        } else if (PATTERN_REFERENCE.test(line)) {
          processLines = true;
          console.log("FOUND  2", line);
          continue;
        }
        // End-of-section detection (index / appendix / acknowledgments /
        // about the authors) is currently disabled, so once processLines is
        // true it stays true for all remaining pages.

        if (processLines) {
          // Skip this line if it belongs to header or footer
          if (Math.abs(refY - pageMinY) < 5 || Math.abs(refY - pageMaxY) < 5) {
            continue;
          }
          // CASE 1: Single column references
          if (pageLayoutStats.length === 1) {
            if (PATTERN_NUMBERED_1.test(line)) {
              patternType = "NUMBERED_REFS";
            } else if (PATTERN_NUMBERED_2.test(line)) {
              patternType = "NUMBERED_REFS";
            }
            if (
              patternType !== "NUMBERED_REFS" &&
              PATTERN_LASTNAME_1.test(pages[p][tc].str.trim())
            ) {
              patternType = "LASTNAME_REFS";
            }
            if (newRef === null) {
              console.log(
                "REF PATTERN > > ",
                patternType,
                " > ",
                pageLayoutStats.length
              );
              refPattern = patternType;
            }
            // NOTE(review): patternType is never cleared, so after the first
            // recognised reference this branch only pushes the existing
            // newRef and never assigns the current line to it; newRef is
            // assigned only while patternType is still unset.
            if (
              patternType === "NUMBERED_REFS" ||
              patternType === "LASTNAME_REFS"
            ) {
              if (newRef !== null) {
                console.log("new ref>> 1:1", newRef);
                refsArray.push(newRef);
              }
            } else {
              newRef = line;
            }
          }

          // CASE: 2-Column reference (only one column found)
          // A line at the first x-position starts a new reference (the two
          // clusters must be less than 25 apart); a line further than 10 from
          // it is appended to the current reference.
          else if (pageLayoutStats.length === 2) {
            if (
              Math.abs(refX - pageLayoutStats[0].xpos) < 10 &&
              Math.abs(pageLayoutStats[0].xpos - pageLayoutStats[1].xpos) < 25
            ) {
              const skip = getPattern(line) === null ? true : false;
              console.log(line, " skip ? ", skip);
              if (newRef !== null && !skip) {
                console.log("new ref>> 2:1", newRef);
                refsArray.push(newRef);
              }
              newRef = line;
            } else if (Math.abs(refX - pageLayoutStats[0].xpos) > 10) {
              newRef += " " + line;
            }
          }
          // CASE: 2-Column reference (two columns found)
          else if (pageLayoutStats.length === 3) {
            if (PATTERN_NUMBERED_1.test(line)) {
              patternType = "NUMBERED_REFS";
            } else if (PATTERN_NUMBERED_2.test(line)) {
              patternType = "NUMBERED_REFS";
            }
            if (
              patternType !== "NUMBERED_REFS" &&
              PATTERN_LASTNAME_1.test(pages[p][tc].str.trim())
            ) {
              patternType = "LASTNAME_REFS";
            }
            if (
              patternType === "NUMBERED_REFS" ||
              patternType === "LASTNAME_REFS"
            ) {
              // For the very first time, save out the type of reference pattern
              if (newRef === null) {
                console.log(
                  "REF PATTERN > > ",
                  patternType,
                  " > ",
                  pageLayoutStats.length
                );
                refPattern = patternType;
              }
            }
            // NOTE(review): the layout-handling branches below are part of the
            // same else-if chain as the three position checks, so they run
            // only for lines that are NOT within 10 of any cluster. In the
            // "3:1" and "3:2" branches refPartSno/refPartBody are therefore
            // always false; only the "3:3" branch appends text.
            if (Math.abs(refX - pageLayoutStats[0].xpos) < 10) {
              layout = "3:1";
            } else if (Math.abs(refX - pageLayoutStats[1].xpos) < 10) {
              layout = "3:2";
            } else if (Math.abs(refX - pageLayoutStats[2].xpos) < 10) {
              layout = "3:3";
            } else if (layout === "3:1") {
              var refPartSno = false;
              var refPartBody = false;
              refPartSno =
                Math.abs(refX - pageLayoutStats[0].xpos) < 10 ? true : false;
              refPartBody =
                Math.abs(refX - pageLayoutStats[1].xpos) < 10 ? true : false;
              if (refPartSno) {
                const skip = getPattern(line) === null ? true : false;
                if (newRef !== null && !skip) {
                  console.log("new ref>> 2:1", newRef);
                  refsArray.push(newRef);
                }
                newRef = line;
              } else if (refPartBody) {
                newRef += " " + line;
              }
            } else if (layout === "3:2") {
              var refPartSno = false;
              var refPartBody = false;
              refPartSno =
                Math.abs(refX - pageLayoutStats[1].xpos) < 10 ? true : false;
              refPartBody =
                Math.abs(refX - pageLayoutStats[2].xpos) < 10 ? true : false;

              if (refPartSno) {
                const skip = getPattern(line) === null ? true : false;
                if (newRef !== null && !skip) {
                  console.log("new ref>> 2:1", newRef);
                  refsArray.push(newRef);
                }
                newRef = line;
              } else if (refPartBody) {
                newRef += " " + line;
              }
            } else if (layout === "3:3") {
              newRef += " " + line;
            }
          } else if (pageLayoutStats.length === 4) {
            // Four clusters: treated as two columns, each with a label
            // position (clusters 0 and 2) and a body position (1 and 3).
            if (PATTERN_NUMBERED_1.test(line)) {
              patternType = "NUMBERED_REFS";
            } else if (PATTERN_NUMBERED_2.test(line)) {
              patternType = "NUMBERED_REFS";
            }
            if (
              patternType !== "NUMBERED_REFS" &&
              PATTERN_LASTNAME_1.test(pages[p][tc].str.trim())
            ) {
              patternType = "LASTNAME_REFS";
            }
            if (
              patternType === "NUMBERED_REFS" ||
              patternType === "LASTNAME_REFS"
            ) {
              // For the very first time, save out the type of reference pattern
              if (newRef === null) {
                console.log(
                  "REF PATTERN > > ",
                  patternType,
                  " > ",
                  pageLayoutStats.length
                );
                refPattern = patternType;
              }
            }
            // Check if refX is first column
            // (layout keeps its previous value when refX matches neither
            // label position)
            if (Math.abs(refX - pageLayoutStats[0].xpos) < 10) {
              layout = "2:1";
            } else if (Math.abs(refX - pageLayoutStats[2].xpos) < 10) {
              layout = "2:2";
            }

            if (layout === "2:1") {
              var refPartSno = false;
              var refPartBody = false;
              refPartSno =
                Math.abs(refX - pageLayoutStats[0].xpos) < 10 ? true : false;
              refPartBody =
                Math.abs(refX - pageLayoutStats[1].xpos) < 10 ? true : false;
              if (refPartSno) {
                // A line at the label position starts a new reference; the
                // previous one is stored only if this line looks like a
                // reference start.
                const skip = getPattern(line) === null ? true : false;
                if (newRef !== null && !skip) {
                  refsArray.push(newRef);
                }
                newRef = line;
              } else if (refPartBody) {
                newRef += " " + line;
              }
            } else if (layout === "2:2") {
              var refPartSno = false;
              var refPartBody = false;
              refPartSno =
                Math.abs(refX - pageLayoutStats[2].xpos) < 10 ? true : false;
              refPartBody =
                Math.abs(refX - pageLayoutStats[3].xpos) < 10 ? true : false;
              if (refPartSno) {
                const skip = getPattern(line) === null ? true : false;
                if (newRef !== null && !skip) {
                  console.log("2:2 - NEW ref: ", line);
                  refsArray.push(newRef);
                }
                newRef = line;
              } else if (refPartBody) {
                newRef += " " + line;
              }
            }
          }
        }
      }
      // End of page: store the reference still being built if it looks like a
      // reference start.
      const skip = getPattern(newRef) === null ? true : false;
      if (newRef && !skip) refsArray.push(newRef);

      // process no more pages
      if (referencesEndFound) {
        const skip = getPattern(newRef) === null ? true : false;
        if (newRef && !skip) refsArray.push(newRef);
        break;
      }
    }
    // `newRef` is declared with `var` inside the loop, so it is still visible
    // here (function scope).
    console.log(">>>>>", newRef);

    console.log("REFERENCES: ", refsArray);
    return refsArray;
  });
};

/**
 * Returns the text of one page as a single string: the `str` of every text
 * item, each followed by one space.
 *
 * @param {number} pageNum - page number passed to `getPage`.
 * @param {object} PDFDocumentInstance - object exposing `getPage(n)`, whose
 *   pages offer `getTextContent()`.
 * @returns {Promise<string>} The concatenated page text. Rejects when the
 *   page or its text content cannot be obtained (previously such failures
 *   left the promise pending forever).
 */
const getPDFPageText = (pageNum, PDFDocumentInstance) =>
  new Promise((resolve) => {
    // Resolve with the chain itself so rejections reach the caller; a
    // synchronous throw from getPage is converted by the executor.
    resolve(
      PDFDocumentInstance.getPage(pageNum)
        .then((pdfPage) => pdfPage.getTextContent())
        .then((textContent) =>
          textContent.items.map((item) => item.str + " ").join("")
        )
    );
  });

/**
 * Returns the text of the first page of `pdf` by delegating to
 * `getPDFPageText`.
 *
 * The previous version first fetched page 1 itself and discarded it before
 * delegating, and never rejected on failure; both are fixed here.
 *
 * @param {object} pdf - document handed to `getPDFPageText`.
 * @returns {Promise<string>} Text of page 1; rejects if extraction fails.
 */
const getPDFText = (pdf) =>
  new Promise((resolve) => {
    const firstPage = 1;
    // Adopt the delegate's promise so its rejection propagates.
    resolve(getPDFPageText(firstPage, pdf));
  });

/**
 * Collects the text items of every page and passes the page-wise lists to
 * `getReferences`.
 *
 * @param {object} PDFDocumentInstance - object exposing `numPages` and
 *   `getPage(n)` (1-based), whose pages offer `getTextContent()`.
 * @returns {Promise<*>} Whatever `getReferences` returns for the collected
 *   pages. A page whose text content has no `items` contributes an empty
 *   list (previously it contributed `undefined`, which broke the per-page
 *   `.length` access downstream).
 */
const pdfProcessReferences = (PDFDocumentInstance) => {
  const pdf = PDFDocumentInstance;
  const pagePromises = [];

  for (let p = 1; p <= pdf.numPages; p++) {
    pagePromises.push(
      pdf
        .getPage(p)
        .then((page) => page.getTextContent())
        // Copy the items so later pruning cannot touch the original array.
        .then((textContent) =>
          textContent.items ? [...textContent.items] : []
        )
    );
  }

  // Wait for all pages, then extract the references from the page-wise items.
  return Promise.all(pagePromises).then((pageContent) =>
    getReferences(pageContent)
  );
};

/**
 * Extracts references from page-wise text items: first narrows the items
 * with `pruneTextContent`, then builds the result with `buildReferenceSet`.
 *
 * @param {Array<Array<object>>} pageWiseTextItems - one list of text items
 *   per page.
 * @returns {*} Whatever `buildReferenceSet` returns for the pruned items.
 */
const getReferences = (pageWiseTextItems) => {
  const prunedList = pruneTextContent(pageWiseTextItems);
  return buildReferenceSet(prunedList);
};

/**
 * Selects the text items that belong to the references section.
 *
 * The section starts on the LAST page containing a line that begins with a
 * "references"/"bibliography"-like heading (whitespace between letters is
 * tolerated, and a missing leading "r" is accepted). Items are collected from
 * that heading until a short line (< 20 characters) beginning with
 * "appendix" or "acknowledg" is reached, or until the document ends. Blank
 * items and lines identical to the most recent heading are left out.
 *
 * Fix over the previous version: the patterns no longer carry the `g` flag.
 * With `g`, `RegExp.prototype.test` resumes from `lastIndex` after a match,
 * so the line tested right after any match was falsely rejected (e.g. an
 * "Appendix" heading following a long "Acknowledgements ..." line was
 * missed). Missing pages and items without a string `str` are now skipped
 * instead of throwing.
 *
 * @param {Array<Array<{str: string}>>} pageWiseTextItems - one list of text
 *   items per page.
 * @returns {Array<object>} The selected items (same object references), in
 *   document order; empty when no heading is found.
 */
const pruneTextContent = (pageWiseTextItems) => {
  const PATTERN_REFERENCE =
    /(^r?\s*e\s*f\s*e\s*r\s*e\s*n\s*c\s*e?\s*s?\s*)|(^b\s*i\s*b\s*l\s*i\s*o\s*g\s*r\s*a\s*p\s*h\s*y\s*)/im;
  const PATTERN_REFERENCE_END =
    /(^a\s*p\s*p\s*e\s*n\s*d\s*i\s*x\s*)|(^a\s*c\s*k\s*n\s*o\s*w\s*l\s*e\s*d\s*g\s*)/im;

  // Trimmed text of an item, or "" when the item carries no usable string.
  const lineOf = (item) =>
    item && typeof item.str === "string" ? item.str.trim() : "";

  const pages = pageWiseTextItems.map((items) => items || []);

  // Locate the last page that contains a heading-like line.
  let startPage = -1;
  pages.forEach((items, p) => {
    if (items.some((item) => PATTERN_REFERENCE.test(lineOf(item)))) {
      startPage = p;
    }
  });

  const selectedItems = [];
  if (startPage === -1) return selectedItems;

  let referencesStartFound = false;
  let matchedPattern = "";

  for (let p = startPage; p < pages.length; p++) {
    for (const item of pages[p]) {
      const line = lineOf(item);
      if (line === "") continue;

      if (PATTERN_REFERENCE.test(line)) {
        referencesStartFound = true;
        matchedPattern = line;
      }

      if (!referencesStartFound) continue;

      // A short appendix/acknowledgement heading closes the section; nothing
      // after it is ever selected, so stop here.
      if (line.length < 20 && PATTERN_REFERENCE_END.test(line)) {
        return selectedItems;
      }

      // Leave out the heading itself (and any later line identical to it).
      if (line !== matchedPattern) selectedItems.push(item);
    }
  }

  return selectedItems;
};

/**
 * Groups the text items of a reference section into individual reference strings.
 *
 * Each item is classified by how its (trimmed) text begins: a bracketed or
 * dotted number ("[12]", "12.") or a capitalised word (treated as an author
 * last name). The first reference start that is seen freezes the style for the
 * rest of the list; every later line that starts a reference of that same
 * style closes the reference being accumulated and opens a new one, and every
 * other line is appended to the current reference.
 *
 * Last-name references only count as a start when the item sits at the left
 * edge of its column, so capitalised continuation lines that are indented are
 * not mistaken for new references. A second column is assumed to begin at the
 * first vertical jump larger than half of the vertical extent of all items.
 *
 * Numbered references are not validated for being in sequence, so a stray
 * number at the beginning of a line does start a new reference.
 *
 * @param {Array<{str: string, transform?: number[]}>} textItems Text items in
 *   reading order. `transform[4]` / `transform[5]` are read as the x / y
 *   position of the item; items without a `transform` are ignored.
 * @returns {Array<{text: string, fav: boolean}>} One entry per detected
 *   reference, in order of appearance; `fav` is always `false`.
 */
const buildReferenceSet = (textItems) => {
  const maxReferenceCharacterLength = 500;

  // These patterns are reused with `.test()` for every line, so none of them
  // may carry the `g` flag: a global regex remembers `lastIndex` after a
  // successful match and would start the next test in the middle of the next
  // line, making consecutive references be missed.
  const PATTERN_YEAR = /^[0-9]{4}/;
  const PATTERN_NUMBERED_1 = /(^\[\s*([0-9\s*\.*]*)\s*\])/im;
  const PATTERN_NUMBERED_2 = /^([0-9]+\.)(?![0-9])(?![a-z])/;
  const PATTERN_LASTNAME_1 = /(^[A-Z][a-z]*),?(\s*([A-Z].?[a-z]*)*).?/;
  const PATTERN_LASTNAME_2_TRUNCATED = /^([A-Z][a-z]*\s\$)/m;
  // NOTE: an alternative last-name matcher was being trialled alongside the
  // ones above and is kept here for reference only (it is not applied):
  //   /(^[A-Z][a-z]*\s[A-Z]*\,)|(^[A-Z][a-z]*\s[A-Z]*\,?(\s\(\d+\)))/im

  // Rounds a coordinate to the nearest integer.
  const toRoundedInt = (value) => Number.parseInt(value.toFixed(0), 10);

  // Truncates over-long references and tidies spacing around punctuation.
  const cleanupText = (text) => {
    let cleaned = text;
    if (cleaned.length > maxReferenceCharacterLength) {
      cleaned = cleaned.substring(0, maxReferenceCharacterLength) + "..";
    }
    return cleaned
      .replace(/\s\$\s/gim, "")
      .replace(/(\s\.)/gim, ".")
      .replace(/(\s\,)/gim, ",")
      .replace(/\s\s+/g, " ");
  };

  // Returns "NUMBERED_REFS", "LASTNAME_REFS", "PATTERN_YEAR" or null.
  const classifyLine = (line) => {
    let patternType = null;
    if (PATTERN_YEAR.test(line)) {
      patternType = "PATTERN_YEAR";
    } else if (PATTERN_NUMBERED_1.test(line) || PATTERN_NUMBERED_2.test(line)) {
      patternType = "NUMBERED_REFS";
    }
    if (
      patternType !== "NUMBERED_REFS" &&
      (PATTERN_LASTNAME_1.test(line) || PATTERN_LASTNAME_2_TRUNCATED.test(line))
    ) {
      patternType = "LASTNAME_REFS";
    }
    return patternType;
  };

  // Extents of the positioned items: the left edge of the first column and the
  // vertical range used to recognise a jump to the next column.
  let layoutMinX = 1e10;
  let layoutMinY = 1e10;
  let layoutMaxY = -1;
  for (const item of textItems) {
    if (!item.transform) continue;
    const x = item.transform[4];
    const y = item.transform[5];
    if (x < layoutMinX) layoutMinX = x;
    if (y < layoutMinY) layoutMinY = y;
    if (y > layoutMaxY) layoutMaxY = y;
  }
  const columnJumpThreshold = Math.abs(0.5 * (layoutMaxY - layoutMinY));

  const refList = [];
  let firstRefType = null; // style of the first reference found, then frozen
  let refItem = ""; // text of the reference currently being accumulated
  let newColumn = false; // set once the jump to a second column was seen
  let col2X = -1; // x position of the first item after that jump
  let prevY = null; // y of the previous positioned item, if any

  for (const item of textItems) {
    if (!item.transform) continue;

    // Combining diacritical marks are replaced by "$" after decomposition.
    const line = item.str
      .trim()
      .normalize("NFD")
      .replace(/[\u0300-\u036f]/g, "$");
    const refX = toRoundedInt(item.transform[4]);
    const refY = toRoundedInt(item.transform[5]);

    // Compare against the previous *positioned* item; the array neighbour may
    // have no transform at all.
    if (
      prevY !== null &&
      !newColumn &&
      Math.abs(refY - prevY) > columnJumpThreshold
    ) {
      newColumn = true;
      col2X = refX;
    }
    prevY = refY;

    // Is this item at the left edge of the column it belongs to?
    const refStart = newColumn
      ? Math.abs(refX - col2X) <= 20
      : Math.abs(layoutMinX - refX) <= 5;

    const patternType = classifyLine(line);
    const startsReference =
      patternType === "NUMBERED_REFS" ||
      (patternType === "LASTNAME_REFS" && refStart);

    // The very first reference start decides the style for the whole list.
    if (!firstRefType && startsReference) {
      firstRefType = patternType;
    }

    if (startsReference && patternType === firstRefType) {
      // Close the reference accumulated so far and begin a new one.
      refItem = cleanupText(refItem);
      if (refItem !== "") {
        refList.push({ text: refItem, fav: false });
      }
      refItem = line;
    } else if (firstRefType !== null) {
      // Continuation line; anything before the first reference is dropped.
      refItem += " " + line;
    }
  }

  if (refItem !== "") {
    refList.push({ text: cleanupText(refItem), fav: false });
  }

  return refList;
};

/******************* NOT USED **************************/
// const openFileReadAsArrayBuffer = function (event) {
//   var input = event.target;
//   var reader = new FileReader();
//   reader.onload = function () {
//     // var arrayBuffer = reader.result;
//     // console.log(arrayBuffer.byteLength);
//   };
//   reader.readAsArrayBuffer(input.files[0]);
// };
// const openFileReadAsDataURL = function (event) {
//   var input = event.target;
//   var reader = new FileReader();
//   reader.onload = function () {
//     var dataURL = reader.result;
//     var output = document.getElementById("output2");
//     output.src = dataURL;
//   };
//   reader.readAsDataURL(input.files[0]);
// };
// const openFileReadAsText = function (event) {
//   var input = event.target;
//   var reader = new FileReader();
//   reader.onload = function () {
//     var text = reader.result;
//     // console.log(reader.result.substring(0, 200));
//   };
//   reader.readAsText(input.files[0]);
// };
/**************************************************/

export {
  getPDFText,
  getPDFPageText,
  loadPDFDoc,
  loadPDF,
  loadPDFFromURL,
  pdfProcessReferences,
  processReferences,
};
