Utente:Leonard Vertighel/temp2
More unfinished stuff (see temp)
Note: decode the encoded angle bracket in the line list($c, $r) = preg_split('|</pre>|', $tempp, 2);
<?php
/*
 * External link extraction.
 *
 * Reads the latest text of every non-redirect page in namespace 0, splits it
 * into parts, extracts the http/https URLs found in each part and stores them
 * in the `extlinks` table. One row of per-page statistics is written to
 * `extl_stat`.
 *
 * Part type codes (stored in extlinks.t_type):
 *   c = text inside <!-- ... --> comments
 *   t = text inside <nowiki> or <pre>
 *   s = text of the "external links" section
 *   a = all remaining page text
 *
 * NOTE(review): this listing appears to hold three separate scripts (this one,
 * extlink-check and extlink-check-master); confirm before running it as a
 * single file.
 */
require_once('config.php');
require_once('functions.php');

$dbr = db_connect();
// NOTE(review): presumably db_connect(true) returns the writable connection;
// confirm in functions.php.
$dbw = db_connect(true);

echo "Retrieving pages from db...\n";
$db_query = 'select page_title, old_text from it_page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 0';
$result = mysql_unbuffered_query($db_query, $dbr);
if ( $result === false ) {
    echo "Received MySQL error:\n\n" . mysql_error($dbr) . "\n\nThe query was:\n\n$db_query\n\n";
    exit(1);
}

$counter = 0;
echo "Starting external link extraction...\n";

while ( $row = mysql_fetch_assoc($result) ) {
    $query_parts = array();
    $text = array();
    $title = mysql_escape_string($row['page_title']);
    // do NOT decode char refs, since MW1.6a doesn't for ext links
    $old_text = $row['old_text'];

    // separate commented parts
    $chunks = explode('<!--', $old_text);
    $text['c'] = '';
    $rest = array_shift($chunks);
    foreach ( $chunks as $chunk ) {
        // array_pad: an unclosed comment has no closing marker and yields a single piece
        list($inside, $after) = array_pad(explode('-->', $chunk, 2), 2, '');
        $text['c'] .= ' ' . $inside;
        $rest .= ' ' . $after;
    }

    // separate nowiki
    $chunks = preg_split('/<nowiki[^\n>]*>/iu', $rest);
    if ( $chunks === false ) {
        // preg_split failed (e.g. invalid UTF-8 under the u modifier): keep the text unsplit
        $chunks = array($rest);
    }
    $text['t'] = '';
    $rest = array_shift($chunks);
    foreach ( $chunks as $chunk ) {
        list($inside, $after) = array_pad(preg_split('|</nowiki>|', $chunk, 2), 2, '');
        $text['t'] .= ' ' . $inside;
        $rest .= ' ' . $after;
    }

    // separate pre
    $chunks = preg_split('/<pre[^\n>]*>/iu', $rest);
    if ( $chunks === false ) {
        $chunks = array($rest);
    }
    $rest = array_shift($chunks);
    foreach ( $chunks as $chunk ) {
        list($inside, $after) = array_pad(preg_split('|</pre>|', $chunk, 2), 2, '');
        $text['t'] .= ' ' . $inside;
        $rest .= ' ' . $after;
    }

    // separate "external links" section (assume only one)
    // capture '==' delimiter in order to split at next section of same or higher level
    // (assuming starting and ending delimiter to be balanced)
    $pieces = preg_split('/(?:^|\n)(={2,6})\s*(?:collegamenti|collegamento|link|links|rinvio|rinvii)\s+(?:esterni|esterno)?\s*={2,6}/iu', $rest, 2, PREG_SPLIT_DELIM_CAPTURE);
    if ( $pieces === false ) {
        $pieces = array($rest);
    }
    $text['a'] = $pieces[0];
    // $pieces[2] only exists when a link section heading was found
    if ( isset($pieces[2]) && $pieces[2] !== '' ) {
        $hlevel = strlen($pieces[1]);
        $section = preg_split("/(^|\\n)={2,$hlevel}[^=\\n]/", $pieces[2], 2);
        $text['s'] = $section[0];
        if ( isset($section[1]) ) {
            // text after the link section goes back to the article text
            $text['a'] .= $section[1];
        }
    }

    // finished splitting, now extract links from each part
    foreach ( $text as $type => $part ) {
        // not entirely accurate, since we might chop off
        // a trailing apostrophe. assume for now that
        // this case does not occur
        $part = preg_replace("/'{2,}/", ' ', $part);

        // To obtain a *more or less* accurate result,
        // try to "parse" the templates
        if ( $type != 't' ) {
            $chunks = explode('{{', $part);
            $part = array_shift($chunks);
            foreach ( $chunks as $chunk ) {
                // !== false: a closing '}}' at offset 0 is still a match
                if ( strpos($chunk, '}}') !== false ) {
                    list($template, $tail) = explode('}}', $chunk, 2);
                    // for now we are just interested in getting rid
                    // of | attached to free links (just naively assuming
                    // them to be all valid separators, period)
                    $part .= ' ' . strtr($template, array('|' => ' ')) . ' ' . $tail;
                } else {
                    // it seems we can't match this
                    // so we put it back to where it came from
                    $part .= '{{' . $chunk;
                }
            }
        }

        // for now we ignore malformed urls, including those
        // accidentally attached to preceding word
        preg_match_all('/(?:(\[)|[^A-Za-z0-9]|^)(https?\:\/\/[^][<>"\s]+)([^]\n]*)(\]?)/', $part, $matches, PREG_SET_ORDER);

        foreach ( $matches as $match ) {
            // cut the url at the first &lt; or &gt; entity; the remainder
            // is treated as the start of the link text
            list($url, $ltpart) = array_pad(preg_split('/(?=&(lt|gt);)/', $match[2], 2), 2, '');

            if ( $match[1] && $match[4] ) {
                // bracketed link
                $linktext = mysql_escape_string(trim($ltpart . $match[3]));
                if ( $linktext == '' ) {
                    // ] does never appear in link text, so we use
                    // it to mark bracketed links w/ no text
                    $linktext = ']';
                }
            } else {
                // remove trailing punctuation
                // (bracket cannot be at 0 position)
                // note that MW1.6 cuts off trailing \
                $regexp = ( strpos($url, '(') ) ? '/[,;\\\\.:!?]*$/' : '/[,;\\\\.:!?)]*$/';
                $url = preg_replace($regexp, '', $url);
                // empty linktext implies non-bracketed link
                $linktext = '';
            }

            // split url into parts:
            $url_parts = parse_url($url);
            if ( $url_parts === false ) {
                echo "Malformed URL '$url' found in $title\n";
                continue;
            }

            // Escaped raw and url-decoded copies of each component; components
            // missing from the url become empty strings.
            $raw = array();
            $dec = array();
            foreach ( array('user', 'pass', 'host', 'path', 'query', 'fragment') as $key ) {
                $value = isset($url_parts[$key]) ? $url_parts[$key] : '';
                $raw[$key] = mysql_escape_string($value);
                $dec[$key] = mysql_escape_string(urldecode($value));
            }

            $scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : '';
            $secure = ( $scheme == 'http' ) ? 0 : 1;

            // mark ip addresses by empty tld
            if ( preg_match('/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/', $dec['host']) ) {
                $tld = '';
                $domain = $dec['host'];
            } else {
                $host_parts = explode('.', $dec['host']);
                $tld = array_pop($host_parts);
                $domain = array_pop($host_parts) . '.' . $tld;
            }

            // 80 doubles as the "no explicit port" marker (also for https);
            // the checker script omits the port from the url when it is 80.
            $port = empty($url_parts['port']) ? 80 : (int) $url_parts['port'];

            // the page text is not entity-decoded, so query separators may
            // still be written as &amp;
            $dec['query'] = strtr($dec['query'], array('&amp;' => '&'));

            $query_parts[] = "($secure, '{$raw['user']}', '{$raw['pass']}', '{$raw['host']}', $port, '{$raw['path']}', '{$raw['query']}', '{$raw['fragment']}', '{$dec['user']}', '{$dec['pass']}', '{$dec['host']}', '{$dec['path']}', '{$dec['query']}', '{$dec['fragment']}', '$domain', '$tld', '$type', '$linktext', '$title')";
        }
    }

    // write urls to database
    $link_count = count($query_parts);
    if ( $link_count ) {
        $db_query = 'insert into extlinks (u_secure, u_user, u_pass, u_host, u_port, u_path, u_query, u_fragment, ud_user, ud_pass, ud_host, ud_path, ud_query, ud_fragment, ud_domain, ud_tld, t_type, t_link_text, t_page_title) values ' . implode(', ', $query_parts);
        mysql_query($db_query, $dbw);
        if ( $error = mysql_error($dbw) ) {
            echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$db_query\n\n";
            exit(1);
        }
    }

    // page statistics
    // exclude comments and link section from page length
    $page_len = strlen($text['a']) + strlen($text['t']);
    // links per 1000 bytes of counted text; 0 when there is no counted text
    $link_dens = ( $page_len > 0 ) ? round($link_count * 1000 / $page_len) : 0;
    $db_query = "insert into extl_stat (page_title, page_len, link_abs, link_rel) values ('$title', $page_len, $link_count, $link_dens)";
    mysql_query($db_query, $dbw);
    if ( $error = mysql_error($dbw) ) {
        echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$db_query\n\n";
        exit(1);
    }

    // progress indicator
    $counter++;
    if ( ($counter % 1000) == 0 ) {
        echo $counter, "\n";
    }
}
?>
<?php
/*
 * extlink-check
 *
 * Usage: php extlink-check.php START LIMIT
 *
 * Takes LIMIT rows of `extlinks` (types 'a' and 's' only) starting at offset
 * START, rebuilds each url, issues a body-less request with curl and stores
 * the HTTP status, content type and check time back into the row.
 * A status of -1 means the request itself failed.
 */
require_once('config.php');
require_once('functions.php');

$db = db_connect();

// cast to int: the values are interpolated into the SQL below
$start = isset($argv[1]) ? (int) $argv[1] : 0;
$limit = isset($argv[2]) ? (int) $argv[2] : 0;

$query = "select id, u_secure, u_host, u_port, u_path, u_query from extlinks where t_type = 'a' or t_type = 's' order by id limit $start, $limit";
$result = mysql_query($query, $db);
if ( $result === false ) {
    echo "Received MySQL error:\n\n" . mysql_error($db) . "\n\nThe query was:\n\n$query\n\n";
    exit(1);
}

while ( $row = mysql_fetch_assoc($result) ) {
    $id = (int) $row['id'];

    // rebuild the url from its stored components; port 80 means "default port"
    $url = $row['u_secure'] ? 'https://' : 'http://';
    $url .= $row['u_host'];
    if ( $row['u_port'] != 80 ) {
        $url .= ':' . $row['u_port'];
    }
    $url .= $row['u_path'];
    if ( $row['u_query'] ) {
        $url .= '?' . $row['u_query'];
    }
    // the stored raw query may still contain &amp; as separator
    $url = strtr($url, array('&amp;' => '&'));

    $http_code = -1;
    $content_type = '';

    $ch = curl_init($url);
    if ( $ch !== false ) {
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_NOBODY, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_TIMEOUT, 15);
        curl_exec($ch);

        if ( !curl_errno($ch) ) {
            $http_code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $reported_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            if ( $reported_type ) {
                // keep only the media type, drop parameters such as charset
                $type_parts = explode(';', $reported_type, 2);
                $content_type = $type_parts[0];
            }
        }
        // release the handle; one is created per row
        curl_close($ch);
    }

    $time = time();
    $content_type = mysql_escape_string($content_type);
    $query = "update extlinks set c_status = $http_code, c_time = $time, c_type = '$content_type' where id = $id";
    // the result of the update is not checked
    mysql_query($query, $db);
}
?>
<?php
/*
 * extlink-check-master
 *
 * Starts 100 background extlink-check.php processes, each handling one
 * consecutive batch of $limit rows.
 */
$limit = 715; // IMPORTANT: adapt to database size! (should really automate this)

for ( $i = 0; $i < 100; $i++ ) {
    $start = $i * $limit;
    // the trailing & detaches the worker, its output is discarded
    echo shell_exec("php /home/matteo/imago/extlink-check.php $start $limit > /dev/null 2>&1 &");
    echo "$i started ($start, $limit)\n";
}
?>