More unfinished stuff (see temp)

Note: decode the encoded angle bracket in this line: list($c, $r) = preg_split('|</pre>|', $tempp, 2);

<?php
// External link extraction.
//
// Reads every non-redirect page of namespace 0, splits its wikitext into
// parts and records each http/https URL found, tagged with the part it came
// from:
//   c = text inside HTML comments
//   t = text inside nowiki / pre blocks
//   s = the "external links" section
//   a = everything else
// One row per URL is written to `extlinks`, one summary row per page to
// `extl_stat`.
require_once('config.php');
require_once('functions.php');

// db_connect() is defined outside this file. Presumably the first handle is
// used for reading and the second (argument true) for writing -- confirm in
// functions.php. Two handles are needed in any case: the read query below is
// unbuffered, so its connection stays busy while rows are being fetched.
$dbr = db_connect();
$dbw = db_connect(true);

// Every component parse_url() can return; merged in so that components
// missing from a URL become '' instead of undefined variables.
$url_defaults = array(
	'scheme' => '', 'host' => '', 'port' => '', 'user' => '',
	'pass' => '', 'path' => '', 'query' => '', 'fragment' => '',
);

echo "Retrieving pages from db...\n";
$db_query = 'select page_title, old_text from it_page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 0';
$result = mysql_unbuffered_query($db_query, $dbr);
if ( $result === false ) {
	$error = mysql_error($dbr);
	echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$db_query\n\n";
	exit;
}
$counter = 0;
echo "Starting external link extraction...\n";
while ( $row = mysql_fetch_assoc($result) ) {
	$query_parts = array();
	// NB: insertion order (c, t, a, s) is also the order rows are inserted in
	$text = array();
	$title = mysql_escape_string($row['page_title']);
	// do NOT decode char refs, since MW1.6a doesn't for ext links
	$old_text = $row['old_text'];

	// separate commented parts
	$chunks = explode('<!--', $old_text);
	$text['c'] = '';
	$rest = $chunks[0];
	unset($chunks[0]);
	foreach ( $chunks as $chunk ) {
		// an unterminated comment has no closing marker: pad so that
		// the "after" half is simply empty
		list($inside, $after) = array_pad(explode('-->', $chunk, 2), 2, '');
		$text['c'] .= ' ' . $inside;
		$rest .= ' ' . $after;
	}

	// separate nowiki
	$chunks = preg_split('/<nowiki[^\n>]*>/iu', $rest);
	$text['t'] = '';
	$rest = $chunks[0];
	unset($chunks[0]);
	foreach ( $chunks as $chunk ) {
		// closing tag is matched case-insensitively, like the opening one
		list($inside, $after) = array_pad(preg_split('|</nowiki>|i', $chunk, 2), 2, '');
		$text['t'] .= ' ' . $inside;
		$rest .= ' ' . $after;
	}

	// separate pre
	$chunks = preg_split('/<pre[^\n>]*>/iu', $rest);
	$rest = $chunks[0];
	unset($chunks[0]);
	foreach ( $chunks as $chunk ) {
		list($inside, $after) = array_pad(preg_split('|</pre>|i', $chunk, 2), 2, '');
		$text['t'] .= ' ' . $inside;
		$rest .= ' ' . $after;
	}

	// separate "external links" section (assume only one)
	// capture '==' delimiter in order to split at next section of same or higher level
	// (assuming starting and ending delimiter to be balanced)
	$split = preg_split('/(?:^|\n)(={2,6})\s*(?:collegamenti|collegamento|link|links|rinvio|rinvii)\s+(?:esterni|esterno)?\s*={2,6}/iu', $rest, 2, PREG_SPLIT_DELIM_CAPTURE);
	$text['a'] = $split[0];
	// indexes 1 (captured delimiter) and 2 (text after the heading) only
	// exist when the heading was found
	$rest = isset($split[2]) ? $split[2] : '';
	if ( $rest !== '' ) {
		$hlevel = strlen($split[1]);
		$split = preg_split("/(^|\\n)={2,$hlevel}[^=\\n]/", $rest, 2);
		$text['s'] = $split[0];
		// whatever follows the next heading goes back to the article body;
		// absent when the link section is the last one on the page
		$text['a'] .= isset($split[1]) ? $split[1] : '';
	}

	// finished splitting, now extract links from each part
	foreach ( $text as $type => $part ) {
		// not entirely accurate, since we might chop off
		// a trailing apostrophe. assume for now that
		// this case does not occur
		$part = preg_replace("/'{2,}/", ' ', $part);
		// To obtain a *more or less* accurate result,
		// try to "parse" the templates
		if ( $type != 't' ) {
			$chunks = explode('{{', $part);
			$part = $chunks[0];
			unset($chunks[0]);
			foreach ( $chunks as $chunk ) {
				// compare against false: '}}' at offset 0 (an empty
				// template) is still a closed template
				if ( strpos($chunk, '}}') !== false ) {
					list($template, $after) = explode('}}', $chunk, 2);
					// for now we are just interested in getting rid
					// of | attached to free links (just naively assuming
					// them to be all valid separators, period)
					$part .= ' ' . strtr($template, array('|' => ' ')) . ' ' . $after;
				} else {
					// it seems we can't match this
					// so we put it back to where it came from
					$part .= '{{' . $chunk;
				}
			}
		}
		// for now we ignore malformed urls, including those
		// accidentally attached to preceding word
		// groups: 1 = opening bracket, 2 = url, 3 = rest of line up to a
		// closing bracket, 4 = closing bracket
		preg_match_all('/(?:(\[)|[^A-Za-z0-9]|^)(https?\:\/\/[^][<>"\s]+)([^]\n]*)(\]?)/', $part, $matches, PREG_SET_ORDER);
		foreach ( $matches as $match ) {
			// cut the url at the first encoded angle bracket; the remainder
			// (if any) belongs to the link text
			list($url, $ltpart) = array_pad(preg_split('/(?=&(lt|gt);)/', $match[2], 2), 2, '');
			if ( $match[1] && $match[4] ) {
				// bracketed link
				$linktext = mysql_escape_string(trim($ltpart . $match[3]));
				if ( $linktext == '' ) {
					// ] does never appear in link text, so we use
					// it to mark bracketed links w/ no text
					$linktext = ']';
				}
			} else {
				// remove trailing punctuation; a closing parenthesis is only
				// stripped when the url contains no opening one
				// note that MW1.6 cuts off trailing \
				$regexp = ( strpos($url, '(') !== false ) ? '/[,;\\\\.:!?]*$/' : '/[,;\\\\.:!?)]*$/' ;
				$url = preg_replace($regexp, '', $url);
				// empty linktext implies non-bracketed link
				$linktext = '';
			}
			// split url into parts:
			$url_parts = parse_url($url);
			if ( $url_parts === false ) {
				echo "Malformed URL '$url' found in $title\n";
				continue;
			}
			// $u = components as written, $d = url-decoded components;
			// both escaped for use inside the SQL literal below
			$u = array_map('mysql_escape_string', $url_parts) + $url_defaults;
			$d = array_map('mysql_escape_string', array_map('urldecode', $url_parts)) + $url_defaults;
			$secure = ( $u['scheme'] == 'http' ) ? 0 : 1;
			// mark ip addresses by empty tld
			if ( preg_match('/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/', $d['host']) ) {
				$tld = '';
				$domain = $d['host'];
			} else {
				$host_parts = explode('.', $d['host']);
				$tld = array_pop($host_parts);
				$domain = array_pop($host_parts) . '.' . $tld;
			}
			// 80 is stored whenever the url carries no explicit port
			// (for https urls too)
			$port = $u['port'] ? $u['port'] : 80;
			// the decoded query should carry plain ampersands; the previous
			// map of '&' to '&' was a no-op (entity most likely lost in a paste)
			$d_query = strtr($d['query'], array('&amp;' => '&'));
			$query_parts[] = "($secure, '{$u['user']}', '{$u['pass']}', '{$u['host']}', $port, '{$u['path']}', '{$u['query']}', '{$u['fragment']}', '{$d['user']}', '{$d['pass']}', '{$d['host']}', '{$d['path']}', '$d_query', '{$d['fragment']}', '$domain', '$tld', '$type', '$linktext', '$title')";
		}
	}

	// write urls to database
	$link_count = count($query_parts);
	if ( $link_count > 0 ) {
		$db_query = 'insert into extlinks (u_secure, u_user, u_pass, u_host, u_port, u_path, u_query, u_fragment, ud_user, ud_pass, ud_host, ud_path, ud_query, ud_fragment, ud_domain, ud_tld, t_type, t_link_text, t_page_title) values ' . implode(', ', $query_parts);
		mysql_query($db_query, $dbw);
		if ( $error = mysql_error($dbw) ) {
			echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$db_query\n\n";
			exit;
		}
	}

	// page statistics
	// exclude comments and link section from page length
	$page_len = strlen($text['a']) + strlen($text['t']);
	// links per 1000 bytes; an empty page has density 0 rather than a
	// division by zero
	$link_dens = ( $page_len > 0 ) ? round($link_count * 1000 / $page_len) : 0;
	$db_query = "insert into extl_stat (page_title, page_len, link_abs, link_rel) values ('$title', $page_len, $link_count, $link_dens)";
	mysql_query($db_query, $dbw);
	if ( $error = mysql_error($dbw) ) {
		echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$db_query\n\n";
		exit;
	}

	// progress indicator, one line per 1000 pages
	$counter++;
	if ( ($counter%1000) == 0 ) {
		echo $counter, "\n";
	}
}
?>

<?php //extlink-check
// Link checker worker.
//
// Usage: php extlink-check.php <start> <limit>
//
// Takes the slice [start, start + limit) of the `extlinks` rows of type
// 'a' or 's' (ordered by id), issues a body-less request for each URL and
// stores the HTTP status, check time and content type back into the row.
// A status of -1 means the transfer itself failed.
require_once('config.php');
require_once('functions.php');

$db = db_connect();

// both values end up in the SQL text, so insist on non-negative integers
if ( !isset($argv[1], $argv[2]) ) {
	echo "Usage: php extlink-check.php <start> <limit>\n";
	exit(1);
}
$start = max(0, (int) $argv[1]);
$limit = max(0, (int) $argv[2]);

$query = "select id, u_secure, u_host, u_port, u_path, u_query from extlinks where t_type = 'a' or t_type = 's' order by id limit $start, $limit";
$result = mysql_query($query, $db);
if ( $result === false ) {
	$error = mysql_error($db);
	echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$query\n\n";
	exit(1);
}
while ( $row = mysql_fetch_assoc($result) ) {
	$id = $row['id'];

	// rebuild the url from its stored components; port 80 is treated as
	// "no explicit port" and left out
	$url = $row['u_secure'] ? 'https://' : 'http://';
	$url .= $row['u_host'];
	if ( $row['u_port'] != 80 ) {
		$url .= ':' . $row['u_port'];
	}
	$url .= $row['u_path'];
	if ( $row['u_query'] ) {
		$url .= '?' . $row['u_query'];
	}
	// components are stored as written in the wikitext, so ampersands may
	// still be entity-encoded; the previous map of '&' to '&' was a no-op
	$url = strtr($url, array('&amp;' => '&'));

	// defaults describe a failed transfer
	$http_code = -1;
	$content_type = '';

	$ch = curl_init($url);
	if ( $ch !== false ) {
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
		curl_setopt($ch, CURLOPT_HEADER, false);
		// headers are all we need, skip the body
		curl_setopt($ch, CURLOPT_NOBODY, true);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		// certificate problems are not what is being checked here
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
		curl_setopt($ch, CURLOPT_TIMEOUT, 15);
		curl_exec($ch);
		if ( !curl_errno($ch) ) {
			$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
			$type_header = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
			if ( $type_header ) {
				// keep the media type only, drop parameters such as charset
				$type_parts = explode(';', $type_header, 2);
				$content_type = $type_parts[0];
			}
		}
		// release the handle; one is created per row
		curl_close($ch);
	}

	$time = time();
	$content_type = mysql_escape_string($content_type);
	$query = "update extlinks set c_status = $http_code, c_time = $time, c_type = '$content_type' where id = $id";
	if ( mysql_query($query, $db) === false ) {
		// report, but keep checking the remaining links
		$error = mysql_error($db);
		echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$query\n\n";
	}
}
?>


<?php // extlink-check-master
// Launches 100 background checker processes, each one responsible for a
// consecutive slice of $limit rows.
// IMPORTANT: adapt $limit to database size! (should really automate this)
$limit = 715;
$start = 0;
foreach ( range(0, 99) as $i ) {
	// output is discarded and the process detached, so this does not wait
	$command = "php /home/matteo/imago/extlink-check.php $start $limit > /dev/null 2>&1 &";
	echo shell_exec($command);
	echo "$i started ($start, $limit)\n";
	// next worker begins where this one ends
	$start += $limit;
}
?>