diff --git a/senicup.pl b/senicup.pl index 7a0618c..e54e262 100755 --- a/senicup.pl +++ b/senicup.pl @@ -3,11 +3,6 @@ # Multi-UA web scraper written in Perl. Based on scanning the non-private IPv4 network in reverse order and requesting rDNS if a web-related port is open. # (C) Roy van Lunsen -# TODO: Add DB support for storing HTMLs, b64 screenshots, etc.. -# TODO: Utilize the DB for randomizing the scanning of IPv4 blocks supporting continuation. -# TODO: Add RE-based querying system for scoped IPv4/domain scrapes. -# TODO: Reliably detect when being blocked by companies that own (too) much network space and delay requests appropriately (might not be necessary if various parts are sufficiently randomized). - use strict; use warnings; use utf8; @@ -43,18 +38,18 @@ $driver->set_timeout('implicit', 20000); $driver->set_timeout('page load', 15000); my $p = Net::Ping->new("syn", 3); -sub interrupt { -if ($#_ == 0) { +sub INTERRUPT { +if (exists($args{'host'})) { open my $TMP_TMP, '>', './lastip'; -print $TMP_TMP $_[0], "\n"; +print $TMP_TMP $args{'host'}, "\n"; close $TMP_TMP; -} else {print "no args", "\n"} +} else {print "no args", "\n\n\n\n\n\n\n\n\n\n\n"} $p->close(); $driver->shutdown_binary; +exit 0; } -$SIG{'INT'} = 'interrupt($args{"host"})'; -$SIG{'HUP'} = 'interrupt($args{"host"})'; +$SIG{'INT'} = 'INTERRUPT'; sub connectivity_check { chomp(my $connectivity = `ip a | grep -A 2 -Ei '^[0-9]+: wl[^:]+:' | grep -E '\\s+?inet\\b' | sed -E 's/^\\s+?inet\\s+?([^/]+).*\$/\\1/'`); # Will do for now. @@ -72,8 +67,9 @@ my $l2 = substr($digest, 1, 1); my $l3 = substr($digest, 2, 1); my $l4 = substr($digest, 3, 1); my $datadir = './data/'."$l1/$l2/$l3/$l4/$digest/"; -make_path($datadir) or return 10; +make_path($datadir) or (print 'Skipping: Already visited URL ', $_[0], ' before.', "\n" and return 0); &connectivity_check; +print 'Navigating to ', "http://$_[0]\n"; $driver->get("http://$_[0]"); open my $FILEH_A, '>:encoding(UTF-8)', $datadir.'src.html'; open my $FILEH_B, '>', $datadir.'url.txt'; @@ -81,35 +77,43 @@ my $FILEH_C; open my $FILEH_D, '>', $datadir.'src_hea.txt'; # User-agent headers. open my $FILEH_E, '>', $datadir.'src2_hes.txt'; # Server headers (only applicable for Curl). open my $FILEH_F, '>', $datadir.'src2_hea.txt'; +print 'Writing Selenium-rendered page source to ', $datadir.'src.html', "\n"; eval {print $FILEH_A $driver->get_page_source("http://$_[0]")}; &connectivity_check($datadir); +print 'Writing Selenium screenshot to ', $datadir.'scs.png', "\n"; eval {$driver->capture_screenshot($datadir.'scs.png', {'full' => 1})}; $easy->setopt(CURLOPT_URL, "http://$_[0]"); if (-z $datadir.'scs.png') { # A zero-length screenshot happens with full-screen images. Assume non-html MIME. -open $FILEH_C, '>', $datadir.'scs.png'} else {open $FILEH_C, '>', $datadir.'src2.html'} +print 'Writing non-HTML Curl-rendered page source to ', $datadir.'scs.png', "\n"; open $FILEH_C, '>', $datadir.'scs.png'} else {print 'Writing Curl-rendered page source to ', $datadir.'src2.html', "\n"; open $FILEH_C, '>', $datadir.'src2.html'} $easy->setopt(CURLOPT_FILE, $FILEH_C); +print 'Writing server headers to ', $datadir.'src2_hes.txt', "\n"; $easy->setopt(CURLOPT_HEADERDATA, $FILEH_E); eval {$easy->perform()}; print $FILEH_B $_[1]; # First, print the ip. print $FILEH_B "\n"; eval {print $FILEH_B $driver->get_current_url()}; # Then print the (redirected to) browser location. +print 'Got redirected to ', $driver->get_current_url(), "\n" if "http://$_[0]" ne $driver->get_current_url(); print $FILEH_B "\n"; print $FILEH_B 'http://', $_[0]; # Lastly, print visited domain/ip. print $FILEH_B "\n"; +print 'Writing Selenium UA headers to ', $datadir.'src_hea.txt', "\n"; eval {print $FILEH_D $driver->get_user_agent()}; print $FILEH_D "\n"; +print 'Writing Curl UA headers to ', $datadir.'src2_hea.txt', "\n"; print $FILEH_F $easy_ua, "\n"; } sub get_443_src { +print 'i am retarded', "\n"; my $digest = sha1_hex($_[0]); # Use the sha1sum of the domain (if unavailable ip) (w/o protocol prefix) to build directories. my $l1 = substr($digest, 0, 1); my $l2 = substr($digest, 1, 1); my $l3 = substr($digest, 2, 1); my $l4 = substr($digest, 3, 1); my $datadir = './data/'."$l1/$l2/$l3/$l4/$digest/"; -make_path($datadir) or return 10; +make_path($datadir) or (print 'Skipping: Already visited URL ', $_[0], ' before.', "\n" and return 0); &connectivity_check; +print 'Navigating to ', "https://$_[0]\n"; $driver->get("https://$_[0]"); open my $FILEH_A, '>:encoding(UTF-8)', $datadir.'src.html'; open my $FILEH_B, '>', $datadir.'url.txt'; @@ -117,23 +121,29 @@ my $FILEH_C; open my $FILEH_D, '>', $datadir.'src_hea.txt'; # User-agent headers. open my $FILEH_E, '>', $datadir.'src2_hes.txt'; # Server headers (only applicable for Curl). open my $FILEH_F, '>', $datadir.'src2_hea.txt'; +print 'Writing Selenium-rendered page source to ', $datadir.'src.html', "\n"; eval {print $FILEH_A $driver->get_page_source("https://$_[0]")}; &connectivity_check($datadir); +print 'Writing Selenium screenshot to ', $datadir.'scs.png', "\n"; eval {$driver->capture_screenshot($datadir.'scs.png', {'full' => 1})}; $easy->setopt(CURLOPT_URL, "https://$_[0]"); if (-z $datadir.'scs.png') { # A zero-length screenshot happens with full-screen images. Assume non-html MIME. -open $FILEH_C, '>', $datadir.'scs.png'} else {open $FILEH_C, '>', $datadir.'src2.html'} +print 'Writing non-HTML Curl-rendered page source to ', $datadir.'scs.png', "\n"; open $FILEH_C, '>', $datadir.'scs.png'} else {print 'Writing Curl-rendered page source to ', $datadir.'src2.html', "\n"; open $FILEH_C, '>', $datadir.'src2.html'} $easy->setopt(CURLOPT_FILE, $FILEH_C); +print 'Writing server headers to ', $datadir.'src2_hes.txt', "\n"; $easy->setopt(CURLOPT_HEADERDATA, $FILEH_E); eval {$easy->perform()}; print $FILEH_B $_[1]; # First, print the ip. print $FILEH_B "\n"; eval {print $FILEH_B $driver->get_current_url()}; # Then print the (redirected to) browser location. +print 'Got redirected to ', $driver->get_current_url(), "\n" if "https://$_[0]" ne $driver->get_current_url(); print $FILEH_B "\n"; print $FILEH_B 'https://', $_[0]; # Lastly, print visited domain/ip. print $FILEH_B "\n"; +print 'Writing Selenium UA headers to ', $datadir.'src_hea.txt', "\n"; eval {print $FILEH_D $driver->get_user_agent()}; print $FILEH_D "\n"; +print 'Writing Curl UA headers to ', $datadir.'src2_hea.txt', "\n"; print $FILEH_F $easy_ua, "\n"; } @@ -147,6 +157,7 @@ udp_timeout => 10 my $reply = $res->search("$_[0]", "PTR"); if ($reply) { foreach my $rr (grep { $_->type eq "PTR" } $reply->answer) { # Do not assume rr-objects are of the same type as requested (use grep). +print 'Got ', $rr->ptrdname, "\n"; push @obj, $rr->ptrdname; } } @@ -161,20 +172,20 @@ $p->ping($host); (my $tmphost = $host) =~ s/[.][0-9.]+$//; $host[$tmphost] = $host; } -while (my ($host) = $p->ack) {push @{$hoa{$_[0]}}, $host; $host =~ s/[.][0-9.]+$//; splice @host, $host} +while (my ($host) = $p->ack) {print 'Queueing ', $host, ' because of ', $_[1], '-ACK on port ', $_[0], "\n"; push @{$hoa{$_[0]}}, $host; $host =~ s/[.][0-9.]+$//; splice @host, $host} } $initvar=1; unless (defined($continue_from)) {($ii1, $ii2, $ii3, $ii4) = (1, 0, 0, 0)} -LABEL4: foreach my $i4 (0..254) { +LABEL4: foreach my $i4 (0..255) { if ($initvar == 1) { until ($i4 >= $ii4) {next LABEL4} } -LABEL3: foreach my $i3 (0..254) { +LABEL3: foreach my $i3 (0..255) { if ($initvar == 1) { until (int($i4*255**3+$i3*255**2) >= int($ii4*255**3+$ii3*255**2)) {next LABEL3} } -LABEL2: foreach my $i2 (0..254) { +LABEL2: foreach my $i2 (0..255) { if ($initvar == 1) { until (int($i4*255**3+$i3*255**2+$i2*255) >= int($ii4*255**3+$ii3*255**2+$ii2*255)) {next LABEL2} } @@ -188,13 +199,13 @@ undef %hoa; my $host_end = $i2.'.'.$i3.'.'.$i4; $args{'port'} = '80'; $args{'proto'} = 'tcp'; $p->port_number($args{'port'}); -foreach my $i1 ($iii1..9,$iii2..126,$iii3..254) { # Skip large, private ipv4 blocks. +foreach my $i1 ($iii1..9,$iii2..126,$iii3..255) { # Skip large, private ipv4 blocks. $args{'host'} = $i1.'.'.$host_end; $p->ping($args{'host'}); $host[$i1] = $args{'host'}; } &connectivity_check; -while (my ($host) = $p->ack) {push @{$hoa{'80'}}, $host; $host =~ s/[.][0-9.]+$//; splice @host, $host} +while (my ($host) = $p->ack) {print 'Queueing ', $host, ' because of ', $args{'proto'}, '-ACK on port ', $args{'port'}, "\n"; push @{$hoa{'80'}}, $host; $host =~ s/[.][0-9.]+$//; splice @host, $host} &connectivity_check; &syn_ping_elmn(80, 'udp'); @@ -204,19 +215,21 @@ while (my ($host) = $p->ack) {push @{$hoa{'80'}}, $host; $host =~ s/[.][0-9.]+$/ foreach my $host (@{$hoa{'80'}}) { next unless defined($host); utime time, time, './lastip'; + print 'Looking up PTR-records for ', $host, "\n"; my @rray = &reverse_dns_doms($host); if ($#rray == 0 and $rray[0] eq '') {eval {&get_80_src($host, $host)}} else { foreach (@rray) { sleep rand(1)/(rand(10)+1); - eval {&get_80_src($_, $host)}}}} + eval {&get_80_src($_, $host)}}}} # Catch non-fatal browser errors. foreach my $host (@{$hoa{'443'}}) { next unless defined($host); utime time, time, './lastip'; + print 'Looking up PTR-records for ', $host, "\n"; my @rray = &reverse_dns_doms($host); - if ($#rray == 0 and $rray[0] eq '') {eval {&get_443_src($host, $host)}} else { + if ($#rray == 0 and $rray[0] eq '') {&get_443_src($host, $host)} else { foreach (@rray) { sleep rand(1)/(rand(10)+1); - eval {&get_443_src($_, $host)}}}} + eval {&get_443_src($_, $host)}}}} # Catch non-fatal browser errors. utime time, time, './lastip'; &connectivity_check; $p->close();