Wikipedista:Teslaton/dump-parser-sample.php

<?php

/*
  dump-parser-sample.php

  Wikipedia dump parsing example

  Prerequisites/notes:
    - CLI version of PHP installed with the following extensions available:
      - pcre, mbstring, bz2, xml
    - current latest-pages-articles dump from http://dumps.wikimedia.org/cswiki/latest/ present in:
      - dumps/cswiki-latest-pages-articles.xml  (or .xml.bz2)
    - call using:
      - php dump-parser-sample.php tpl-no-doc
    - see @TODO tagged comments to customize/extend

  Teslaton <http://sk.wikipedia.org/wiki/Redaktor:Teslaton>, 2009-2015
*/

// Report every error level except notices and strict-standards messages.
error_reporting (E_ALL & ~E_NOTICE & ~E_STRICT);


/* 
  Some inlined library functions 
*/

/**
 * Print a message line to standard output, optionally prefixed by a title.
 *
 * @param string      $msg   message text to print
 * @param string|null $title optional prefix, printed as "title: " when non-empty
 * @return string the message, unchanged, so the call can be used inline
 */
function verbose($msg, $title = NULL)
{
  // Only emit the "title: " prefix when a non-empty title was supplied.
  echo ($title ? "{$title}: " : '') . $msg . "\n";
  return $msg;
}

/**
 * Format a message with sprintf() and print it through verbose().
 *
 * Up to nine format arguments are accepted; unused ones default to NULL and
 * are still handed to sprintf().
 *
 * @param string $formatStr sprintf()-style format string
 * @return string the formatted message, as returned by verbose()
 */
function verbosef($formatStr, $p1 = NULL, $p2 = NULL, $p3 = NULL, $p4 = NULL, $p5 = NULL, $p6 = NULL, $p7 = NULL, $p8 = NULL, $p9 = NULL)
{
  $formatted = sprintf($formatStr, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $p8, $p9);

  return verbose($formatted);
}

/**
 * Debug helper: print a var_export() rendering of a value, optionally
 * prefixed by "title: ", followed by a newline.
 *
 * @param mixed       $v     value to dump
 * @param string|null $title optional label printed before the dump
 * @return mixed the dumped value, unchanged
 */
function vd($v, $title = NULL)
{
  $prefix = '';
  if ($title)
    $prefix = "{$title}: ";

  echo $prefix, var_export($v, true), "\n";

  return $v;
}

/**
 * Debug helper: dump a value with vd() and terminate the script with
 * exit status 1.
 *
 * @param mixed       $v     value to dump
 * @param string|null $title optional label passed through to vd()
 */
function vdd($v = NULL, $title = NULL)
{
  vd($v, $title);

  exit(1);
}


/*
   SAX wrapper
   Handling low-level XML element/content callbacks, collecting XML-serialized
   data and calling high-level page handler callback.
*/

class WikiDumpLatestPagesXmlParser
{
  // XML parser handle: a resource on PHP < 8, an XMLParser object on PHP >= 8,
  // null while no parse is in progress.
  var $parser;
  var $elementStack = array();  // names of the currently open elements, outermost first
  var $currentElement;          // name of the innermost open element

  var $errorPrefix;
  var $errors = array();        // messages collected by error()
  var $verbose = false;

  var $pageHandler = null;  // callback to call on page match
  var $structs = array();   // per-element-type data collected for the current page

  var $pageCount = 0;       // number of </page> elements seen
  var $break = false;       // set to true (e.g. from the page handler) to stop parsing

  // The following were previously created on the fly; they are declared here
  // because dynamic property creation is deprecated as of PHP 8.2.
  var $elementCount = 0;    // number of start tags seen
  var $elementDepth = 0;    // current nesting depth (used for the debug XML dump)
  var $content = '';        // character data collected for the current element
  var $fileName;            // dump file name as passed to open()
  var $fileNameWrap;        // file name including the decompression stream wrapper
  var $fileHandle;          // open stream handle, null when closed
  var $_maxElements = null; // debug: stop after this many elements (null = no limit)
  var $_xmlDump = false;    // debug: echo the XML structure while parsing

  /**
   * Record an error message, annotated with the current parser position
   * when a parser is active.
   *
   * @param string $message error description
   */
  function error($message)
  {
    // The parser is a resource before PHP 8 and an XMLParser object since.
    if (is_resource($this->parser) || is_object($this->parser))
      $message = sprintf("%s (line %d, column %d)", $message, xml_get_current_line_number($this->parser), xml_get_current_column_number($this->parser));

    $this->errors[] = $this->errorPrefix . $message;
    // trigger_error($message, E_USER_ERROR);
  }

  /**
   * SAX start-tag callback: resets the struct for container elements
   * (page/revision/contributor) and stores attributes of <text>/<redirect>
   * in the parent element's struct.
   */
  function startElement($parser, $name, $attrs)
  {
    if ($this->_xmlDump) {
      $attrDump = ""; foreach ($attrs as $n => $v) $attrDump .= ' ' . $n . '="' . $v . '"';
      echo "\n" . str_repeat("  ", $this->elementDepth) . "<{$name}{$attrDump}>";
    }

    $parentElement = end($this->elementStack);
    switch ($name) {
      case 'page':
        $this->structs['page'] = array();
        break;
      case 'revision':
        $this->structs['revision'] = array();
        break;
      case 'contributor':
        $this->structs['contributor'] = array();
        break;
      case 'text':
        if (isset($this->structs[$parentElement]))
          $this->structs[$parentElement][$name] = $attrs;
        break;
      case 'redirect':
        if (isset($this->structs[$parentElement]))
          $this->structs[$parentElement][$name] = $attrs;
        break;
    }

    $this->elementCount++;
    $this->elementDepth++;
    $this->content = '';
    $this->currentElement = $name;
    array_push($this->elementStack, $name);
  }

  /**
   * SAX end-tag callback: stores the collected character data in the parent
   * element's struct, assembles revision/page structs and calls the page
   * handler on </page>.
   */
  function endElement($parser, $name)
  {
    $this->elementDepth--;
    if ($this->_xmlDump)
      echo "\n" . str_repeat("  ", $this->elementDepth) . "</{$name}>";

    $stackElement = array_pop($this->elementStack);
    if ($stackElement != $name)
      $this->error("Closing unexpected element: </{$stackElement}> expected, </{$name}> found.");

    $parentElement = end($this->elementStack);
    switch ($name) {
      case 'title':
      case 'ns':
      case 'id':
      case 'timestamp':
      case 'comment':
      case 'ip':
      case 'username':
      case 'parentid':
      case 'minor':
        if (isset($this->structs[$parentElement]))
          $this->structs[$parentElement][$name] = $this->content;
        break;
      case 'redirect':
        // nothing, attributes saved in startElement()
        break;
      case 'text':
        // isset() rather than a truthiness test: startElement() stores the
        // attribute array here, which is empty (falsy) for an attribute-less
        // <text>, and the content must not be dropped in that case.
        if (isset($this->structs[$parentElement]['text']))
          $this->structs[$parentElement]['text']['content'] = $this->content;
        break;
      case 'contributor':
        $this->structs['revision']['contributor'] = $this->structs['contributor'];
        break;
      case 'revision':
        if (!isset($this->structs['page']['revisionCount']))
          $this->structs['page']['revisionCount'] = 0;
        $this->structs['page']['revisionCount']++;
        // Keep at most two revisions: the first one at index 0 and the most
        // recently seen later one at index 1 (overwritten on each new revision).
        if (!isset($this->structs['page']['revisions'])) {
          $this->structs['page']['revisions'] = array($this->structs['revision']);
        } else {
          $this->structs['page']['revisions'][1] = $this->structs['revision'];
        }
        break;
      case 'page':
        $this->pageCount++;
        // Once a break was requested, do not dispatch the pages remaining in
        // the chunk currently being parsed; skip silently when no usable
        // handler is configured.
        if (!$this->break && is_callable($this->pageHandler))
          call_user_func($this->pageHandler, $this->structs['page']);
        break;
      case 'model':
      case 'format':
      case 'sha1':
        // known elements that are intentionally ignored
        break;
      default:
        verbose("Unhandled </{$name}>");
        break;
    }

    $this->content = '';
    $this->currentElement = end($this->elementStack);
  }

  /**
   * SAX character-data callback: appends the data to the content buffer of
   * the current element (may be called several times per element).
   */
  function characterData($parser, $data)
  {
    if ($this->_xmlDump && ($markup = trim($data)) !== '')
      echo ($this->content === '' ? "\n" . str_repeat("  ", $this->elementDepth) : "") . $markup;

    $this->content .= $data;
  }

  /**
   * Open the dump file for reading. Files ending in .gz or .bz2 are read
   * through the compress.zlib:// or compress.bzip2:// stream wrapper.
   *
   * @param string $fileName path of the dump file
   * @throws Exception when the file cannot be opened
   */
  function open($fileName)
  {
    $this->fileName = $fileName;
    $this->fileNameWrap =
      (preg_match("@\.gz$@", $this->fileName) ? "compress.zlib://" . $this->fileName :
      (preg_match("@\.bz2$@", $this->fileName) ? "compress.bzip2://" . $this->fileName :
      $this->fileName));

    $this->fileHandle = fopen($this->fileNameWrap, 'r');
    if (!$this->fileHandle)
      throw new Exception("Failed to open XML dump {$fileName}");
  }

  /**
   * Close the dump file and release the XML parser, if any. Safe to call
   * repeatedly.
   */
  function close()
  {
    if ($this->fileHandle) {
      fclose($this->fileHandle);
      $this->fileHandle = null;
    }
    if ($this->parser) {
      // Only resource-type parsers (PHP < 8) need an explicit free.
      if (is_resource($this->parser))
        xml_parser_free($this->parser);
      $this->parser = null;
    }
  }

  /**
   * Parse the opened dump in 16 KiB chunks, printing a progress line roughly
   * once per second and calling the page handler for each page.
   *
   * Parsing stops early when $this->break is set or the debug element limit
   * is reached.
   *
   * @throws Exception when no file is open or the XML is not well-formed
   */
  function parse()
  {
    if (!$this->fileHandle)
      throw new Exception("No XML dump opened, call open() first");

    $this->parser = $parser = xml_parser_create();
    xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
    xml_set_element_handler($parser, array($this, "startElement"), array($this, "endElement"));
    xml_set_character_data_handler($parser, array($this, "characterData"));

    $this->break = false;
    $this->_maxElements = null;
    // $this->_maxElements = 1000000;
    $this->_xmlDump = false;
    // $this->_xmlDump = true;

    $bytesRead = 0;
    $lastDumpTime = 0;   // 0 makes the first chunk print a progress line
    $finalized = false;  // whether the parser has been told about the end of input
    $stopped = false;    // whether parsing was aborted deliberately
    while (($data = fread($this->fileHandle, 16*1024)) !== false && $data !== '') {
      $bytesRead += strlen($data);
      $isFinal = feof($this->fileHandle);
      if (!xml_parse($parser, $data, $isFinal)) {
        throw new Exception(sprintf("XML error: %s at line %d",
          xml_error_string(xml_get_error_code($parser)),
          xml_get_current_line_number($parser)));
      }
      $finalized = $isFinal;
      if (abs(time() - $lastDumpTime) >= 1) {
        verbosef("%s MB, %s elements, %s pages, %s MB RAM", number_format($bytesRead / 1024 / 1024, 3), number_format($this->elementCount, 0, '.', ' '), number_format($this->pageCount, 0, '.', ' '), number_format(memory_get_usage(true) / 1024 / 1024, 0, '.', ' '));
        $lastDumpTime = time();
      }
      if (isset($this->_maxElements) && $this->elementCount >= $this->_maxElements) {
        verbose("Maxnodes reached, skipping rest of file");
        $stopped = true;
        break;
      }
      if ($this->break) {
        verbose("Break set, skipping rest of file");
        $stopped = true;
        break;
      }
    }

    // EOF is not always flagged together with the last non-empty chunk; in
    // that case signal the end of input explicitly so that a truncated
    // document is reported instead of being silently accepted.
    if (!$finalized && !$stopped && $bytesRead > 0) {
      if (!xml_parse($parser, '', true)) {
        throw new Exception(sprintf("XML error: %s at line %d",
          xml_error_string(xml_get_error_code($parser)),
          xml_get_current_line_number($parser)));
      }
    }

    if (is_resource($parser))
      xml_parser_free($parser);
    $this->parser = null;
  }

}


/*
  Simple console dump parser app
*/

class Application
{
  // Declared explicitly; dynamic property creation is deprecated as of PHP 8.2.
  var $parser;              // WikiDumpLatestPagesXmlParser instance used by parse()
  var $pageHandler;         // callback invoked for each page by parserNextPage()
  var $record = array();    // data collected by the page handler for the current page
  var $matchCount = 0;      // number of pages for which the handler filled $record
  var $maxMatchCount = 0;   // stop after this many matches (0/empty = no limit)
  var $dumpFileName;        // path of the dump file to parse
  var $doParse = true;      // whether execute() should run parse()
  var $finalizeMethod;      // optional callback called at the end of execute()

  /**
   * Remove HTML comments (including multi-line ones) and the whitespace
   * following them.
   *
   * @param string $content wikitext
   * @return string wikitext without comments
   */
  function removeComments($content)
  {
    // "s" modifier: comments frequently span several lines.
    return preg_replace("@<!--.*?-->\s*@s", "", $content);
  }

  // "tpl-no-doc" page handler
  // $page['title'], $page['ns'], $page['content']
  // fills $this->record[] on positive match
  function nextPage_tplNoDoc($page)
  {
    // if ($page['ns'] != 0)
    //   return;
    if ($page['ns'] != 10)  // Template NS
      return;

    // Match a <noinclude> section that contains neither {{Dokumentace}} nor
    // {{Documentation}}.
    if (preg_match("@<noinclude>((?!\{\{(Dokumentace|Documentation)\}\}).)*?</noinclude>@isu", (string) $page['content'], $m)) {
      vd($m, 'match');
      $this->record['matchCount'] = 1;
      // Keep the first 64 characters of the match as a snippet.
      $this->record['snippet'] = mb_substr($m[0], 0, 64);
      // $this->record['...'] = ...;
    }

    // if (...) throw new Exception("break");
  }

  // @TODO add custom nextPage_... handlers

  /**
   * Parser callback for each page: extracts the text of the first stored
   * revision into $page['content'], runs the configured page handler and
   * reports/counts the page when the handler filled $this->record.
   *
   * @param array $page page struct built by WikiDumpLatestPagesXmlParser
   */
  function parserNextPage($page)
  {
    $title = $page['title'];
    $revisions = (isset($page['revisions']) && is_array($page['revisions']) ? $page['revisions'] : array());
    $currentRev = reset($revisions);  // false when the page has no revisions
    $content = (is_array($currentRev) && isset($currentRev['text']['content']) ? $currentRev['text']['content'] : NULL);
    if (!$content) {
      verbose("Page '{$title}': no content");
    }
    $page['content'] = $content;

    $this->record = array();

    call_user_func($this->pageHandler, $page);

    if ($this->record) {
      // $this->record['recordType'] = ($this->record['recordType'] ? $this->record['recordType'] : 'page');
      //   'timeStamp', 'recordType', 'ns', 'title', 'pageid', 'revCount', 'timestamp', 'revid', 'old_revid', 'user', 'comment'),
      $this->record['ns'] = $page['ns'];
      $this->record['title'] = $page['title'];

      // @TODO log positive somewhere
      // DumpParserLogWriter::staticLogRecord($this->record);
      vd($page['title'], 'positive');

      // Count each match exactly once, then compare against the limit.
      $this->matchCount++;
      if ($this->maxMatchCount && $this->matchCount >= $this->maxMatchCount) {
        verbose("maxMatchCount reached: {$this->maxMatchCount}");
        if (is_object($this->parser))
          $this->parser->break = true;
      }
    }
  }

  /**
   * Parse the configured dump file, dispatching every page to
   * parserNextPage(). An Exception with the message 'break' thrown by a page
   * handler ends parsing quietly; any other exception is rethrown after the
   * parser has been closed.
   */
  function parse()
  {
    $fileName = $this->dumpFileName;

    $this->parser = new WikiDumpLatestPagesXmlParser();

    $this->parser->pageHandler = array($this, "parserNextPage");

    $this->parser->open($fileName);
    try {
      $this->parser->parse();
    } catch (Exception $e) {
      // Release the file handle and parser on every exit path.
      $this->parser->close();
      if ($e->getMessage() == 'break') {
        verbose("Break");
        return;
      }
      throw $e;
    }
    $this->parser->close();

    // if ($this->twoPhase) {
    //   $this->secondPhase();
    // }
  }

  /**
   * Configure the runtime, select the page handler for the given action and
   * run the parse.
   *
   * @param string|null $actionName action name with dashes removed (e.g. 'tplnodoc')
   */
  function execute($actionName = NULL)
  {
    // PCRE limits
    ini_set('pcre.recursion_limit', (string) max(10000000, (int) ini_get('pcre.recursion_limit')));
    ini_set('pcre.backtrack_limit', (string) max(10000000, (int) ini_get('pcre.backtrack_limit')));

    // mbstring encoding
    mb_internal_encoding("utf-8");

    // Dump file name
    $this->dumpFileName = "dumps/cswiki-latest-pages-articles.xml";  // @TODO custom dump filename/location
    if (!file_exists($this->dumpFileName) && file_exists($this->dumpFileName . '.bz2'))
      $this->dumpFileName .= '.bz2';  // no extracted version, use archive directly

    $this->doParse = true;

    switch ($actionName) {

      case 'tplnodoc':
        $this->pageHandler = array($this, "nextPage_tplNoDoc");
        break;

      // @TODO add custom action handlers
      // case '...':
      //   $this->pageHandler = array($this, "nextPage_...");
      //   break;

      default:
        trigger_error("Unknown action: '{$actionName}'", E_USER_ERROR);
        // Do not go on to parse without a page handler should a custom error
        // handler let execution continue.
        return;
    }

    if ($this->doParse)
      $this->parse();

    if ($this->finalizeMethod) {
      call_user_func($this->finalizeMethod);
    }
  }

  /**
   * Entry point: takes the action name from the first CLI argument, with
   * dashes removed ('tpl-no-doc' becomes 'tplnodoc').
   *
   * @param array $argv command-line arguments, script name at index 0
   */
  function main($argv)
  {
    $action = (isset($argv[1]) ? $argv[1] : '');
    $this->execute(str_replace('-', '', $action));
  }
}

// Script entry point: build the application and hand it the CLI arguments.
$application = new Application;
$application->main($GLOBALS['argv']);

?>