#!/usr/bin/perl
# Crawler entry point: loads the crawl and global configuration, reads the
# whitelist (start URLs) and the blacklist (URL patterns to skip), then
# crawls every whitelisted start URL with a depth budget of $M_C_DEEP.

use strict;
use warnings;
use WWW::Mechanize;
use DBI;

# Settings expected to be assigned by the config files required below.
# NOTE(review): $M_C_LINKMAXAGE and the $M_DB_* values are declared here but
# are not read anywhere in the visible part of this script -- confirm whether
# they are still needed.
our ($M_C_DEEP, $M_C_WHITELIST, $M_C_BLACKLIST, $M_C_LINKMAXAGE);
our ($M_DB_HOST, $M_DB_USER, $M_DB_PASS);

# NOTE(review): these relative names are resolved through @INC. Perl 5.26+
# no longer has "." in @INC by default, so they may have to become
# "./config.crawl.pl" / "./config.global.pl" -- confirm where the files live.
require "config.crawl.pl";
require "config.global.pl";

# NOTE(review): this appears to switch off TLS hostname verification for the
# LWP-based HTTP client for the whole process -- confirm this is intended.
$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;

# Read a list file and return its non-blank lines without line endings.
#
# Parameters:
#   $path  - file to read, one entry per line
#   $label - human-readable name of the list, used in error messages
# Dies when no path is configured or the file cannot be opened.
sub _read_list_file {
    my ($path, $label) = @_;

    # A three-argument open with an undefined name would open an anonymous
    # temporary file instead of failing, so reject that case explicitly.
    die "No $label file configured\n"
        unless defined $path && length $path;

    open my $fh, '<', $path
        or die "Cannot open $label file '$path': $!\n";

    my @entries;
    while (my $line = <$fh>) {
        $line =~ s/[\r\n]+\z//;          # also copes with CRLF files
        next unless $line =~ /\S/;       # ignore blank lines
        push @entries, $line;
    }
    close $fh;

    return @entries;
}

# Start URLs; without them there is nothing to crawl, so failure is fatal.
my @A_WHITELIST = _read_list_file($M_C_WHITELIST, 'whitelist');

# Patterns of URLs to skip. As before, a missing blacklist does not stop the
# crawl, but the problem is now reported instead of being silently ignored.
my @A_BLACKLIST = eval { _read_list_file($M_C_BLACKLIST, 'blacklist') };
warn "Continuing without a blacklist: $@" if $@;

for my $start_url (@A_WHITELIST) {
    next if in_blacklist($start_url);
    read_link($start_url, $M_C_DEEP);
}
# Crawl the links of a page recursively.
#
# Every link found on $link that is not blacklisted is stored through
# read_url_to_db() and, while $deep is greater than zero, crawled in turn
# with a depth budget reduced by one.
#
# Parameters:
#   $link - URL of the page whose links are processed
#   $deep - remaining recursion depth; the same value is passed to
#           read_url_to_db() for every link found on this page
# Returns nothing.
sub read_link {
    my ($link, $deep) = @_;

    my %seen_on_page;
    for my $url (fetch_links($link)) {
        # A link repeated on the same page would only repeat the same work.
        next if $seen_on_page{$url}++;
        next if in_blacklist($url);

        read_url_to_db($url, $deep);

        # Hand the reduced depth to the child only. Decrementing $deep itself
        # inside this loop would shrink the budget (and the stored depth) of
        # every later sibling link on the same page.
        read_link($url, $deep - 1) if $deep > 0;
    }

    return;
}
# Download a page and return the absolute URLs of the links found on it.
#
# Parameters:
#   $link - URL of the page to download
# Returns a list of absolute http/https URL strings in the order reported by
# WWW::Mechanize, or an empty list when the page could not be fetched.
sub fetch_links {
    my $link = shift;

    # onerror => undef: no error handler is installed, so a failed request
    # does not abort the crawl; success is checked explicitly below.
    my $mech = WWW::Mechanize->new(onerror => undef);
    $mech->get($link);
    return unless $mech->success;

    my @links;
    for my $found ($mech->links()) {
        next unless defined $found->url;

        # Resolve the href against the page's base URL. Gluing the href onto
        # $link as a string produces invalid URLs for anything but a bare
        # site root (e.g. "http://host/a.html" . "b.html").
        my $absolute = $found->url_abs;

        # Only web pages can be crawled; skip mailto:, javascript:, ftp: ...
        my $scheme = $absolute->scheme;
        next unless defined $scheme && $scheme =~ /\Ahttps?\z/i;

        push @links, $absolute->as_string;
    }

    return @links;
}
# Tell whether a URL matches any blacklist entry.
#
# Each entry of the file-level @A_BLACKLIST is used as a regular expression
# and matched, unanchored, against $link.
#
# Parameters:
#   $link - URL to test
# Returns 1 if any entry matches, otherwise 0.
sub in_blacklist {
    my $link = shift;

    for my $entry (@A_BLACKLIST) {
        # Work on a copy so the shared list is not modified while checking;
        # strip the line ending, including the CR of CRLF files.
        (my $pattern = $entry) =~ s/[\r\n]+\z//;

        # Skip blank entries. An empty pattern does not mean "match nothing"
        # in Perl: it matches everything or re-uses the last successfully
        # matched regex, so one blank line could blacklist every URL.
        next unless $pattern =~ /\S/;

        return 1 if $link =~ /$pattern/;
    }

    return 0;
}
# Store a crawled URL in the "mose" table unless it is already there.
#
# For a link that is not stored yet, the page is fetched to read its title
# and a row (title, link, deep, timestamp) is inserted; the same values are
# printed to STDOUT.
#
# Parameters:
#   $link - URL to store
#   $deep - depth value to store with the URL
# Returns nothing. Dies if the database connection cannot be established.
sub read_url_to_db {
    my ($link, $deep) = @_;
    my $timestamp = time();

    # NOTE(review): database name and credentials are hard-coded here while
    # $M_DB_HOST / $M_DB_USER / $M_DB_PASS are declared at file level --
    # confirm whether the configured values should be used instead.
    my $dbh = DBI->connect('DBI:mysql:1_mose', '1_mose', '1_mose')
        or die "Could not connect to database: $DBI::errstr";

    # $link and the page title come from crawled pages, so they are passed
    # as bind values rather than interpolated into the SQL text.
    my ($known) = $dbh->selectrow_array(
        'SELECT count(*) FROM mose WHERE link = ?', undef, $link);

    if ($known) {
        # Already stored: nothing to do, so the page is not fetched at all.
        # TODO: the original comment announced an update of the existing row
        # here, but no update was ever implemented.
        $dbh->disconnect();
        return;
    }

    # Fetch the page only to read its title; a failed fetch still yields a
    # row, with an empty title.
    my $mech = WWW::Mechanize->new(onerror => undef);
    $mech->get($link);
    my $title = $mech->title();
    $title = '' unless defined $title;

    print "'$title', '$link', '$deep', '$timestamp'\n";
    $dbh->do(
        'insert into mose (title, link, deep, timestamp) values (?, ?, ?, ?)',
        undef, $title, $link, $deep, $timestamp);

    $dbh->disconnect();
    return;
}