searpl

A small PHP/SQLite search engine. <a href="https://thunix.net/~xfnw/search">See it in action</a>.
Log | Files | Refs | README

crawl.php (1233B)


      1 <?php
      // Report every class of PHP error and print it, including errors raised
      // during startup, so problems are visible while the crawler runs.
      error_reporting(E_ALL);
      foreach (['display_errors', 'display_startup_errors'] as $setting) {
          ini_set($setting, '1');
      }

      // Open the SQLite index stored at the relative path "db.sqlite".
      $db = new PDO("sqlite:db.sqlite");

      // Database errors are raised as PHP warnings rather than exceptions.
      $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
     13 
     /**
      * Extract the text of the first <title> element found in an HTML document.
      *
      * The opening tag may carry attributes (e.g. <title lang="en">) and the
      * closing tag may contain whitespace before the ">"; tag names are matched
      * case-insensitively.
      *
      * @param string $fp Raw HTML source of the page (a string, not a file handle).
      * @return string|null The title with every run of whitespace collapsed to a
      *                     single space and the ends trimmed, or null when no
      *                     title element is found.
      */
     function page_title($fp) {
         // s: "." also matches newlines (multi-line titles)
         // i: case-insensitive tag names
         // U: ungreedy, so the capture stops at the first closing tag
         // "(?:\s[^>]*)?" allows attributes but will not match e.g. <titlebar>.
         $pattern = '/<title(?:\s[^>]*)?>(.*)<\/title\s*>/siU';
         if (!preg_match($pattern, $fp, $title_matches)) {
             return null;
         }

         // Clean up the title: remove line breaks and excessive whitespace.
         return trim(preg_replace('/\s+/', ' ', $title_matches[1]));
     }
     24 
     25 
     26 
     // Every command-line argument after the script name is a URL to (re)index.
     $arg = array_slice($argv, 1);

     foreach ($arg as $url) {
         echo "\n";
         // Strip one trailing slash before the URL is used as the lookup key.
         $url = preg_replace('/\/$/', '', $url);
         echo $url."\n";

         // Drop any existing row for this URL before fetching. If the fetch or
         // the parsing below fails, the URL stays out of the index.
         $stmt = $db->prepare('DELETE FROM indexed WHERE url = ?');
         $stmt->execute([$url]);

         $file = file_get_contents($url);
         if (!$file) {
             continue;
         }

         $title = page_title($file);

         // Collapse every run of whitespace, line breaks included, to a single
         // space. Replacing line breaks with nothing would glue the last word
         // of one line to the first word of the next ("foo\nbar" -> "foobar").
         $document = preg_replace('/\s+/', ' ', strip_tags($file));

         // Compare explicitly instead of relying on truthiness, so that a page
         // whose title or text is the string "0" is still indexed.
         $hasTitle = $title !== null && $title !== '';
         $hasDocument = $document !== null && $document !== '';
         if (!$hasTitle || !$hasDocument) {
             echo "no title!\n";
             continue;
         }

         echo "title: ".$title."\n";

         $stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
         $stmt->execute([$title, $url, $document]);
     }