<?php
/*
dump-parser-sample.php
Wikipedia dump parsing example
Prerequisites/notes:
- CLI version of PHP installed with the following extensions available:
- pcre, mbstring, bz2, xml
- current latest-pages-articles dump from http://dumps.wikimedia.org/cswiki/latest/ present in:
- dumps/cswiki-latest-pages-articles.xml (or .xml.bz2)
- call using:
- php dump-parser-sample.php tpl-no-doc
- see @TODO tagged comments to customize/extend
Teslaton <http://sk.wikipedia.org/wiki/Redaktor:Teslaton>, 2009-2015
*/
// Report everything except notices and strict-standards messages.
error_reporting(E_ALL & ~(E_NOTICE | E_STRICT));
/*
Some inlined library functions
*/
// verbose message
/**
 * Prints a message to standard output, followed by a newline.
 *
 * @param mixed $msg   Message to print.
 * @param mixed $title Optional label; when truthy it is printed before the
 *                     message in the form "title: ".
 * @return mixed The message, unchanged, so the call can be used inline.
 */
function verbose($msg, $title = NULL)
{
    // Prefix the message with the label only when one was supplied.
    echo ($title ? "{$title}: " : '') . $msg . "\n";
    return $msg;
}
// verbose message fmt
/**
 * Formats a message with sprintf() and prints it through verbose().
 *
 * Up to nine format arguments are accepted; unused ones default to NULL and
 * are still passed on to sprintf().
 *
 * @param string $formatStr sprintf() format string.
 * @return mixed The formatted message, as returned by verbose().
 */
function verbosef($formatStr, $p1 = NULL, $p2 = NULL, $p3 = NULL, $p4 = NULL, $p5 = NULL, $p6 = NULL, $p7 = NULL, $p8 = NULL, $p9 = NULL)
{
    $formatted = sprintf(
        $formatStr,
        $p1, $p2, $p3,
        $p4, $p5, $p6,
        $p7, $p8, $p9
    );

    return verbose($formatted);
}
// debug dump of a variable
/**
 * Debug helper: prints a var_export() rendering of a value, followed by a newline.
 *
 * @param mixed $v     Value to dump.
 * @param mixed $title Optional label; when truthy it is printed as "title: "
 *                     in front of the dump.
 * @return mixed The dumped value, unchanged.
 */
function vd($v, $title = NULL)
{
    $label = '';
    if ($title) {
        $label = "{$title}: ";
    }

    $rendered = var_export($v, true);
    echo $label . $rendered . "\n";

    return $v;
}
// var dump and die
/**
 * Debug helper: dumps a value via vd() and terminates the script with exit status 1.
 *
 * @param mixed $v     Value to dump.
 * @param mixed $title Optional label passed through to vd().
 */
function vdd($v = NULL, $title = NULL)
{
    vd($v, $title);

    // Integer argument: used as the process exit status, nothing is printed.
    exit(1);
}
/*
SAX wrapper
Handling low-level XML element/content callbacks, collecting XML-serialized
data and calling high-level page handler callback.
*/
/**
 * Streaming (SAX) reader for the XML dump.
 *
 * Usage: assign $pageHandler, then call open(), parse() and close().
 * For every closed <page> element the handler is called with an array holding
 * the scalar child elements collected for the page (title, ns, id, ...),
 * 'revisionCount', and 'revisions', where index 0 is the first revision seen
 * and index 1 (if present) is the last later revision seen.
 */
class WikiDumpLatestPagesXmlParser
{
    var $parser;
    var $elementStack = array();   // names of the currently open elements, outermost first
    var $currentElement;
    var $errorPrefix;
    var $errors = array();
    var $verbose = false;
    var $pageHandler = null;       // callback to call on page match
    var $structs = array();        // structs being assembled, keyed by element name (page/revision/contributor)
    var $pageCount = 0;
    var $break = false;            // set to true (e.g. from the page handler) to stop after the current chunk

    // State that is used by the handlers; declared explicitly so that no
    // dynamic properties are created at runtime.
    var $elementCount = 0;         // number of start tags seen
    var $elementDepth = 0;         // current nesting depth
    var $content = '';             // character data collected since the last start/end tag
    var $fileName;                 // name passed to open()
    var $fileNameWrap;             // file name including a compression stream wrapper, if any
    var $fileHandle;               // stream opened by open()
    var $_maxElements = null;      // optional element limit for test runs (reset by parse())
    var $_xmlDump = false;         // echo the XML structure while parsing (reset by parse())

    /**
     * Records an error message, adding the current parser position when a
     * parser is active.
     *
     * @param string $message Error description.
     */
    function error($message)
    {
        // The parser is a resource up to PHP 7 and an XMLParser object from
        // PHP 8 on, so test for "set" rather than is_resource().
        if ($this->parser)
            $message = sprintf("%s (line %d, column %d)", $message, xml_get_current_line_number($this->parser), xml_get_current_column_number($this->parser));
        $this->errors[] = $this->errorPrefix . $message;
        // trigger_error($message, E_USER_ERROR);
    }

    /**
     * SAX start-tag callback.
     *
     * Starts a fresh struct for page/revision/contributor and stores the
     * attributes of text/redirect on the parent's struct when the parent has one.
     *
     * @param mixed  $parser Parser handle (unused).
     * @param string $name   Element name.
     * @param array  $attrs  Element attributes.
     */
    function startElement($parser, $name, $attrs)
    {
        if ($this->_xmlDump) {
            $attrDump = "";
            foreach ($attrs as $n => $v)
                $attrDump .= ' ' . $n . '="' . $v . '"';
            echo "\n" . str_repeat(" ", $this->elementDepth) . "<{$name}{$attrDump}>";
        }
        $parentElement = end($this->elementStack);
        switch ($name) {
            case 'page':
            case 'revision':
            case 'contributor':
                $this->structs[$name] = array();
                break;
            case 'text':
            case 'redirect':
                if (isset($this->structs[$parentElement]))
                    $this->structs[$parentElement][$name] = $attrs;
                break;
        }
        $this->elementCount++;
        $this->elementDepth++;
        $this->content = '';
        $this->currentElement = $name;
        array_push($this->elementStack, $name);
    }

    /**
     * SAX end-tag callback.
     *
     * Copies the collected character data into the parent's struct, folds
     * finished contributor/revision structs into their owners and calls the
     * page handler when a <page> element closes.
     *
     * @param mixed  $parser Parser handle (unused).
     * @param string $name   Element name.
     */
    function endElement($parser, $name)
    {
        $this->elementDepth--;
        if ($this->_xmlDump)
            echo "\n" . str_repeat(" ", $this->elementDepth) . "</{$name}>";
        $stackElement = array_pop($this->elementStack);
        if ($stackElement != $name)
            $this->error("Closing unexpected element: </{$stackElement}> expected, </{$name}> found.");
        $parentElement = end($this->elementStack);
        switch ($name) {
            case 'title':
            case 'ns':
            case 'id':
            case 'timestamp':
            case 'comment':
            case 'ip':
            case 'username':
            case 'parentid':
            case 'minor':
                if (isset($this->structs[$parentElement]))
                    $this->structs[$parentElement][$name] = $this->content;
                break;
            case 'redirect':
                // nothing, attributes saved in startElement()
                break;
            case 'text':
                // isset() rather than a truthiness test: a <text> element
                // without attributes is stored as an empty array, and its
                // content must still be kept.
                if (isset($this->structs[$parentElement]['text']))
                    $this->structs[$parentElement]['text']['content'] = $this->content;
                break;
            case 'contributor':
                $this->structs['revision']['contributor'] = $this->structs['contributor'];
                break;
            case 'revision':
                if (!isset($this->structs['page']['revisionCount']))
                    $this->structs['page']['revisionCount'] = 0;
                $this->structs['page']['revisionCount']++;
                if (!isset($this->structs['page']['revisions'])) {
                    // first revision of the page
                    $this->structs['page']['revisions'] = array($this->structs['revision']);
                } else {
                    // any later revision overwrites slot 1, so only the first
                    // and the last one are retained
                    $this->structs['page']['revisions'][1] = $this->structs['revision'];
                }
                break;
            case 'page':
                $this->pageCount++;
                call_user_func($this->pageHandler, $this->structs['page']);
                break;
            case 'model':
            case 'format':
            case 'sha1':
                // known elements that are intentionally ignored
                break;
            default:
                verbose("Unhandled </{$name}>");
                break;
        }
        $this->content = '';
        $this->currentElement = end($this->elementStack);
    }

    /**
     * SAX character-data callback; appends the chunk to the content buffer.
     *
     * @param mixed  $parser Parser handle (unused).
     * @param string $data   Character data chunk.
     */
    function characterData($parser, $data)
    {
        if ($this->_xmlDump && ($markup = trim($data)) !== '')
            echo ($this->content === '' ? "\n" . str_repeat(" ", $this->elementDepth) : "") . $markup;
        $this->content .= $data;
    }

    /**
     * Opens the dump file for reading. Names ending in .gz or .bz2 are opened
     * through the matching compression stream wrapper.
     *
     * @param string $fileName Path of the dump file.
     * @throws Exception When the file cannot be opened.
     */
    function open($fileName)
    {
        $this->fileName = $fileName;
        if (preg_match('@\.gz$@', $fileName))
            $this->fileNameWrap = "compress.zlib://" . $fileName;
        elseif (preg_match('@\.bz2$@', $fileName))
            $this->fileNameWrap = "compress.bzip2://" . $fileName;
        else
            $this->fileNameWrap = $fileName;
        $this->fileHandle = fopen($this->fileNameWrap, 'r');
        if (!$this->fileHandle)
            throw new Exception("Failed to open XML dump {$fileName}");
    }

    /**
     * Releases the file handle and the XML parser, if they are still held.
     */
    function close()
    {
        if ($this->fileHandle) {
            fclose($this->fileHandle);
            $this->fileHandle = null;
        }
        if ($this->parser) {
            xml_parser_free($this->parser);
            $this->parser = null;
        }
    }

    /**
     * Reads the opened file in 16 KiB chunks and feeds them to the XML parser,
     * printing a progress line roughly once per second.
     *
     * Stops early when $break is set or the optional element limit is reached.
     *
     * @throws Exception When no file is open or the XML is not well-formed.
     */
    function parse()
    {
        if (!$this->fileHandle)
            throw new Exception("No dump file opened, call open() first");

        $this->parser = $parser = xml_parser_create();
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
        xml_set_element_handler($parser, array($this, "startElement"), array($this, "endElement"));
        xml_set_character_data_handler($parser, array($this, "characterData"));
        $this->break = false;
        $this->_maxElements = null;
        // $this->_maxElements = 1000000;
        $this->_xmlDump = false;
        // $this->_xmlDump = true;
        $bytesRead = 0;
        // 0 makes the first chunk print a progress line right away
        $lastDumpTime = 0;
        while (($data = fread($this->fileHandle, 16*1024)) !== false && $data !== '') {
            $bytesRead += strlen($data);
            if (!xml_parse($parser, $data, feof($this->fileHandle))) {
                throw new Exception(sprintf("XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($parser)),
                    xml_get_current_line_number($parser)));
            }
            if (abs(time() - $lastDumpTime) >= 1) {
                verbosef("%s MB, %s elements, %s pages, %s MB RAM", number_format($bytesRead / 1024 / 1024, 3), number_format($this->elementCount, 0, '.', ' '), number_format($this->pageCount, 0, '.', ' '), number_format(memory_get_usage(true) / 1024 / 1024, 0, '.', ' '));
                $lastDumpTime = time();
            }
            if (isset($this->_maxElements) && $this->elementCount >= $this->_maxElements) {
                verbose("Maxnodes reached, skipping rest of file");
                break;
            }
            if ($this->break) {
                verbose("Break set, skipping rest of file");
                break;
            }
        }
        xml_parser_free($parser);
        $this->parser = null;
    }
}
/*
Simple console dump parser app
*/
/**
 * Console application: selects a page handler by action name, runs the dump
 * parser and reports pages for which the handler filled $record.
 */
class Application
{
    var $parser;                  // WikiDumpLatestPagesXmlParser instance, set in parse()
    var $pageHandler;             // callback examining one page, chosen in execute()
    var $record = array();        // filled by the page handler on a positive match
    var $matchCount = 0;          // number of positive matches so far
    var $maxMatchCount = null;    // optional limit; parsing stops once it is reached
    var $dumpFileName;
    var $doParse = true;
    var $finalizeMethod = null;   // optional callback run at the end of execute()

    /**
     * Removes HTML comments, together with any whitespace following them.
     *
     * @param string $content Text to strip.
     * @return string|null Stripped text (preg_replace() returns null on a PCRE error).
     */
    function removeComments($content)
    {
        // "s" modifier: comments may span several lines
        return preg_replace("@<!--.*?-->\s*@s", "", $content);
    }

    // "tpl-no-doc" page handler
    // $page['title'], $page['ns'], $page['content']
    // fills $this->record[] on positive match
    function nextPage_tplNoDoc($page)
    {
        // if ($page['ns'] != 0)
        //     return;
        if ($page['ns'] != 10) // Template NS
            return;
        // content is NULL for pages without text
        $content = (string) $page['content'];
        if (preg_match("@<noinclude>((?!\{\{(Dokumentace|Documentation)\}\}).)*?</noinclude>@isu", $content, $m)) {
            vd($m, 'match');
            $this->record['matchCount'] = 1;
            // keep the first 64 characters of the match as the snippet
            $this->record['snippet'] = mb_substr($m[0], 0, 64);
            // $this->record['...'] = ...;
        }
        // if (...) throw new Exception("break");
    }

    // @TODO add custom nextPage_... handlers

    /**
     * Parser callback, called once per page.
     *
     * Adds the text of the first stored revision as $page['content'], runs the
     * selected page handler and reports the page when the handler filled
     * $this->record. Asks the parser to stop once maxMatchCount is reached.
     *
     * @param array $page Page struct built by the parser.
     */
    function parserNextPage($page)
    {
        $title = isset($page['title']) ? $page['title'] : '';
        // a page without any <revision> has no 'revisions' key
        $currentRev = !empty($page['revisions']) ? reset($page['revisions']) : NULL;
        $content = isset($currentRev['text']['content']) ? $currentRev['text']['content'] : NULL;
        if (!$content) {
            verbose("Page '{$title}': no content");
        }
        $page['content'] = $content;
        $this->record = array();
        call_user_func($this->pageHandler, $page);
        if ($this->record) {
            // $this->record['recordType'] = ($this->record['recordType'] ? $this->record['recordType'] : 'page');
            // 'timeStamp', 'recordType', 'ns', 'title', 'pageid', 'revCount', 'timestamp', 'revid', 'old_revid', 'user', 'comment'),
            $this->record['ns'] = $page['ns'];
            $this->record['title'] = $page['title'];
            // @TODO log positive somewhere
            // DumpParserLogWriter::staticLogRecord($this->record);
            vd($page['title'], 'positive');
            // count each match exactly once, then compare against the limit
            $this->matchCount++;
            if ($this->maxMatchCount && $this->matchCount >= $this->maxMatchCount) {
                verbose("maxMatchCount reached: {$this->maxMatchCount}");
                $this->parser->break = true;
            }
        }
    }

    /**
     * Parses the dump file with parserNextPage() as the page callback.
     *
     * An exception with the message "break" (thrown from a page handler) ends
     * parsing quietly; any other exception is rethrown after the parser has
     * been closed.
     */
    function parse()
    {
        $fileName = $this->dumpFileName;
        $this->parser = new WikiDumpLatestPagesXmlParser();
        $this->parser->pageHandler = array($this, "parserNextPage");
        $this->parser->open($fileName);
        try {
            $this->parser->parse();
        } catch (Exception $e) {
            // release the file and the XML parser on every exit path
            $this->parser->close();
            if ($e->getMessage() != 'break')
                throw $e;
            verbose("Break");
            return;
        }
        $this->parser->close();
        // if ($this->twoPhase) {
        //     $this->secondPhase();
        // }
    }

    /**
     * Configures the environment, selects the page handler for the given
     * action and runs the parser.
     *
     * @param string|null $actionName Action name with dashes removed (e.g. "tplnodoc").
     */
    function execute($actionName = NULL)
    {
        // PCRE limits
        ini_set('pcre.recursion_limit', (string) max(10000000, (int) ini_get('pcre.recursion_limit')));
        ini_set('pcre.backtrack_limit', (string) max(10000000, (int) ini_get('pcre.backtrack_limit')));
        // mbstring encoding
        mb_internal_encoding("utf-8");
        // Dump file name
        $this->dumpFileName = "dumps/cswiki-latest-pages-articles.xml"; // @TODO custom dump filename/location
        if (!file_exists($this->dumpFileName) && file_exists($this->dumpFileName . '.bz2'))
            $this->dumpFileName .= '.bz2'; // no extracted version, use archive directly
        $this->doParse = true;
        switch ($actionName) {
            case 'tplnodoc':
                $this->pageHandler = array($this, "nextPage_tplNoDoc");
                break;
            // @TODO add custom action handlers
            // case '...':
            //     $this->pageHandler = array($this, "nextPage_...");
            //     break;
            default:
                trigger_error("Unknown action: '{$actionName}'", E_USER_ERROR);
        }
        if ($this->doParse)
            $this->parse();
        if ($this->finalizeMethod) {
            call_user_func($this->finalizeMethod);
        }
    }

    /**
     * Entry point: takes the action name from the first command line argument,
     * with dashes removed ("tpl-no-doc" becomes "tplnodoc").
     *
     * @param array $argv Command line arguments.
     */
    function main($argv)
    {
        // without an argument execute() reports an unknown (empty) action
        $action = isset($argv[1]) ? (string) $argv[1] : '';
        $this->execute(str_replace('-', '', $action));
    }
}
// Script entry point: create the application and pass it the command line arguments.
$application = new Application;
$application->main($GLOBALS['argv']);
?>