crawl.php (1233B)
<?php
// Command-line indexer: every argument is a URL whose page is fetched and
// stored (title, url, plain-text content) in the `indexed` table of db.sqlite.

ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);

$db = new PDO("sqlite:db.sqlite");

// Database errors are reported as PHP warnings; in this mode a failed
// prepare() returns false, which the loop below checks for.
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);

/**
 * Extract the text of the first <title> element of an HTML document.
 *
 * The opening tag may carry attributes (e.g. <title lang="en">). Line breaks
 * and runs of whitespace inside the title are collapsed to single spaces.
 *
 * @param string $fp Raw HTML of the page.
 * @return string|null The cleaned title, or null when no <title> element is found.
 */
function page_title($fp) {
    // s: dot matches newlines, i: case-insensitive, U: ungreedy so the first
    // closing </title> ends the match.
    $res = preg_match('/<title\b[^>]*>(.*)<\/title>/siU', $fp, $title_matches);
    if (!$res)
        return null;

    // Clean up title: remove EOL's and excessive whitespace.
    $title = preg_replace('/\s+/', ' ', $title_matches[1]);
    return trim($title);
}

/**
 * Reduce an HTML document to plain text suitable for indexing.
 *
 * Tags are stripped and every run of whitespace (including line breaks) is
 * collapsed to a single space, so words on adjacent lines stay separated.
 *
 * @param string $html Raw HTML of the page.
 * @return string The whitespace-normalised text; '' when nothing is left.
 */
function page_text($html) {
    return trim((string) preg_replace('/\s+/', ' ', strip_tags($html)));
}

// Everything after the script name is a URL to index. $argv may be undefined
// when the script is not started from the command line.
$arg = isset($argv) ? array_slice($argv, 1) : [];

foreach ($arg as $url) {
    echo "\n";
    // Drop one trailing slash so "http://host/" and "http://host" share a row.
    $url = preg_replace('/\/$/', '', $url);
    echo $url."\n";

    // The old row is removed before fetching, so a URL that can no longer be
    // fetched (or has no title) drops out of the index.
    $stmt = $db->prepare('DELETE FROM indexed WHERE url = ?');
    if ($stmt === false)
        continue; // PDO has already emitted a warning.
    $stmt->execute([$url]);

    $file = file_get_contents($url);
    if ($file === false || $file === '')
        continue;

    $title = page_title($file);
    $document = page_text($file);
    // Explicit comparisons: a title or body of "0" is valid content.
    if ($title === null || $title === '' || $document === '') {
        echo "no title!\n";
        continue;
    }

    echo "title: ".$title."\n";

    $stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
    if ($stmt === false)
        continue; // PDO has already emitted a warning.
    $stmt->execute([$title, $url, $document]);
}