@ -3,11 +3,6 @@
# Multi-UA web scraper written in Perl. It scans the non-private IPv4 address space in reverse order and requests rDNS records for hosts that have a web-related port open.
# (C) Roy van Lunsen
# TODO: Add DB support for storing HTMLs, b64 screenshots, etc.
# TODO: Utilize the DB for randomizing the scanning of IPv4 blocks supporting continuation.
# TODO: Add RE-based querying system for scoped IPv4/domain scrapes.
# TODO: Reliably detect when being blocked by companies that own (too) much network space and delay requests appropriately (might not be necessary if various parts are sufficiently randomized).
use strict ;
use warnings ;
use utf8 ;
@ -43,18 +38,18 @@ $driver->set_timeout('implicit', 20000);
$ driver - > set_timeout ( 'page load' , 15000 ) ;
my $ p = Net::Ping - > new ( "syn" , 3 ) ;
# Shutdown handler: records a host in ./lastip, then closes $p, shuts down
# the driver binary and exits with status 0.
# NOTE(review): this span comes from a patch hunk and holds two variants of
# several lines (sub header, guard, print to the file, else branch);
# presumably the first line of each pair is the pre-change version - confirm
# against the repository before editing.
sub interrupt {
# Variant taking the host as its only argument ($#_ == 0 means one argument).
if ( $# _ == 0 ) {
sub INTERRUPT {
# Variant reading the host from the %args hash, which is declared outside this block.
if ( exists ( $ args { 'host' } ) ) {
# The result of open is not checked.
open my $ TMP_TMP , '>' , './lastip' ;
print $ TMP_TMP $ _ [ 0 ] , "\n" ;
print $ TMP_TMP $ args { 'host' } , "\n" ;
close $ TMP_TMP ;
} else { print "no args" , "\n" }
} else { print "no args" , "\n\n\n\n\n\n\n\n\n\n\n " }
# Cleanup below runs on both branches of the guard.
$ p - > close ( ) ;
$ driver - > shutdown_binary ;
exit 0 ;
}
# Install the signal handlers. Perl treats a string stored in %SIG as the
# name of a handler sub rather than as code to run, so the
# 'interrupt($args{"host"})' form does not call interrupt with that argument;
# 'INTERRUPT' names the sub directly.
# NOTE(review): two variants of the INT assignment appear here (patch hunk);
# HUP is assigned only in the string-call form - confirm whether HUP should
# still be handled.
$ SIG { 'INT' } = 'interrupt($args{"host"})' ;
$ SIG { 'HUP' } = 'interrupt($args{"host"})' ;
$ SIG { 'INT' } = 'INTERRUPT' ;
sub connectivity_check {
chomp ( my $ connectivity = `ip a | grep -A 2 -Ei '^[0-9]+: wl[^:]+:' | grep -E '\\s+?inet\\b' | sed -E 's/^\\s+?inet\\s+?([^/]+).*\$/\\1/'` ) ; # Will do for now.
@ -72,8 +67,9 @@ my $l2 = substr($digest, 1, 1);
my $ l3 = substr ( $ digest , 2 , 1 ) ;
my $ l4 = substr ( $ digest , 3 , 1 ) ;
my $ datadir = './data/' . "$l1/$l2/$l3/$l4/$digest/" ;
make_path ( $ datadir ) or return 1 0;
make_path ( $ datadir ) or ( print 'Skipping: Already visited URL ' , $ _ [ 0 ] , ' before.' , "\n" and return 0 ) ;
& connectivity_check ;
print 'Navigating to ' , "http://$_[0]\n" ;
$ driver - > get ( "http://$_[0]" ) ;
open my $ FILEH_A , '>:encoding(UTF-8)' , $ datadir . 'src.html' ;
open my $ FILEH_B , '>' , $ datadir . 'url.txt' ;
@ -81,35 +77,43 @@ my $FILEH_C;
open my $ FILEH_D , '>' , $ datadir . 'src_hea.txt' ; # User-agent headers.
open my $ FILEH_E , '>' , $ datadir . 'src2_hes.txt' ; # Server headers (only applicable for Curl).
open my $ FILEH_F , '>' , $ datadir . 'src2_hea.txt' ;
print 'Writing Selenium-rendered page source to ' , $ datadir . 'src.html' , "\n" ;
eval { print $ FILEH_A $ driver - > get_page_source ( "http://$_[0]" ) } ;
& connectivity_check ( $ datadir ) ;
print 'Writing Selenium screenshot to ' , $ datadir . 'scs.png' , "\n" ;
eval { $ driver - > capture_screenshot ( $ datadir . 'scs.png' , { 'full' = > 1 } ) } ;
$ easy - > setopt ( CURLOPT_URL , "http://$_[0]" ) ;
if ( - z $ datadir . 'scs.png' ) { # A zero-length screenshot happens with full-screen images. Assume non-html MIME.
open $ FILEH_C , '>' , $ datadir . 'scs.png' } else { open $ FILEH_C , '>' , $ datadir . 'src2.html' }
print 'Writing non-HTML Curl-rendered page source to ' , $ datadir . 'scs.png' , "\n" ; open $ FILEH_C , '>' , $ datadir . 'scs.png' } else { print 'Writing Curl-rendered page source to ' , $ datadir . 'src2.html' , "\n" ; open $ FILEH_C , '>' , $ datadir . 'src2.html' }
$ easy - > setopt ( CURLOPT_FILE , $ FILEH_C ) ;
print 'Writing server headers to ' , $ datadir . 'src2_hes.txt' , "\n" ;
$ easy - > setopt ( CURLOPT_HEADERDATA , $ FILEH_E ) ;
eval { $ easy - > perform ( ) } ;
print $ FILEH_B $ _ [ 1 ] ; # First, print the ip.
print $ FILEH_B "\n" ;
eval { print $ FILEH_B $ driver - > get_current_url ( ) } ; # Then print the (redirected to) browser location.
print 'Got redirected to ' , $ driver - > get_current_url ( ) , "\n" if "http://$_[0]" ne $ driver - > get_current_url ( ) ;
print $ FILEH_B "\n" ;
print $ FILEH_B 'http://' , $ _ [ 0 ] ; # Lastly, print visited domain/ip.
print $ FILEH_B "\n" ;
print 'Writing Selenium UA headers to ' , $ datadir . 'src_hea.txt' , "\n" ;
eval { print $ FILEH_D $ driver - > get_user_agent ( ) } ;
print $ FILEH_D "\n" ;
print 'Writing Curl UA headers to ' , $ datadir . 'src2_hea.txt' , "\n" ;
print $ FILEH_F $ easy_ua , "\n" ;
}
# get_443_src( $name, $ip )
# Fetches https://$name with the Selenium driver and with Curl and stores the
# results in a per-URL directory below ./data/.
#   $_[0]  domain (or ip) without protocol prefix; also the input of the SHA-1
#          digest that names the directory.
#   $_[1]  ip address, written as the first line of url.txt.
# Files written in the directory: src.html (driver page source), scs.png
# (driver screenshot), src2.html or scs.png (Curl output), url.txt,
# src_hea.txt (driver user agent), src2_hes.txt (Curl header data),
# src2_hea.txt ($easy_ua).
# $driver, $easy and $easy_ua are defined outside this block.
# NOTE(review): this span comes from a patch; one line is hidden behind the
# hunk header in the middle, and the make_path line and the body of the -z
# test each appear in two variants - presumably old first, new second.
sub get_443_src {
# NOTE(review): looks like leftover debug output - consider removing.
print 'i am retarded' , "\n" ;
my $ digest = sha1_hex ( $ _ [ 0 ] ) ; # Use the sha1sum of the domain (if unavailable ip) (w/o protocol prefix) to build directories.
# The first four hex characters of the digest become four nested directory levels.
my $ l1 = substr ( $ digest , 0 , 1 ) ;
my $ l2 = substr ( $ digest , 1 , 1 ) ;
my $ l3 = substr ( $ digest , 2 , 1 ) ;
my $ l4 = substr ( $ digest , 3 , 1 ) ;
my $ datadir = './data/' . "$l1/$l2/$l3/$l4/$digest/" ;
# A false result from make_path (nothing created) is treated as "this URL was
# visited before" and ends the sub early.
# NOTE(review): '1 0' in the first variant is not a valid expression.
make_path ( $ datadir ) or return 1 0;
make_path ( $ datadir ) or ( print 'Skipping: Already visited URL ' , $ _ [ 0 ] , ' before.' , "\n" and return 0 ) ;
# Called with & and no parentheses, so the callee sees this sub's @_.
& connectivity_check ;
print 'Navigating to ' , "https://$_[0]\n" ;
$ driver - > get ( "https://$_[0]" ) ;
# None of the open calls below check their result.
open my $ FILEH_A , '>:encoding(UTF-8)' , $ datadir . 'src.html' ;
open my $ FILEH_B , '>' , $ datadir . 'url.txt' ;
@ -117,23 +121,29 @@ my $FILEH_C;
open my $ FILEH_D , '>' , $ datadir . 'src_hea.txt' ; # User-agent headers.
open my $ FILEH_E , '>' , $ datadir . 'src2_hes.txt' ; # Server headers (only applicable for Curl).
open my $ FILEH_F , '>' , $ datadir . 'src2_hea.txt' ;
print 'Writing Selenium-rendered page source to ' , $ datadir . 'src.html' , "\n" ;
# Driver calls are wrapped in eval so an exception from the driver does not abort the sub.
eval { print $ FILEH_A $ driver - > get_page_source ( "https://$_[0]" ) } ;
& connectivity_check ( $ datadir ) ;
print 'Writing Selenium screenshot to ' , $ datadir . 'scs.png' , "\n" ;
eval { $ driver - > capture_screenshot ( $ datadir . 'scs.png' , { 'full' = > 1 } ) } ;
$ easy - > setopt ( CURLOPT_URL , "https://$_[0]" ) ;
# Choose the Curl output file: scs.png when the screenshot file is empty, src2.html otherwise.
if ( - z $ datadir . 'scs.png' ) { # A zero-length screenshot happens with full-screen images. Assume non-html MIME.
open $ FILEH_C , '>' , $ datadir . 'scs.png' } else { open $ FILEH_C , '>' , $ datadir . 'src2.html' }
print 'Writing non-HTML Curl-rendered page source to ' , $ datadir . 'scs.png' , "\n" ; open $ FILEH_C , '>' , $ datadir . 'scs.png' } else { print 'Writing Curl-rendered page source to ' , $ datadir . 'src2.html' , "\n" ; open $ FILEH_C , '>' , $ datadir . 'src2.html' }
$ easy - > setopt ( CURLOPT_FILE , $ FILEH_C ) ;
print 'Writing server headers to ' , $ datadir . 'src2_hes.txt' , "\n" ;
$ easy - > setopt ( CURLOPT_HEADERDATA , $ FILEH_E ) ;
eval { $ easy - > perform ( ) } ;
# url.txt holds three lines: ip, current browser URL, requested URL.
print $ FILEH_B $ _ [ 1 ] ; # First, print the ip.
print $ FILEH_B "\n" ;
eval { print $ FILEH_B $ driver - > get_current_url ( ) } ; # Then print the (redirected to) browser location.
# NOTE(review): unlike the write above, these get_current_url calls are not wrapped in eval.
print 'Got redirected to ' , $ driver - > get_current_url ( ) , "\n" if "https://$_[0]" ne $ driver - > get_current_url ( ) ;
print $ FILEH_B "\n" ;
print $ FILEH_B 'https://' , $ _ [ 0 ] ; # Lastly, print visited domain/ip.
print $ FILEH_B "\n" ;
print 'Writing Selenium UA headers to ' , $ datadir . 'src_hea.txt' , "\n" ;
eval { print $ FILEH_D $ driver - > get_user_agent ( ) } ;
print $ FILEH_D "\n" ;
print 'Writing Curl UA headers to ' , $ datadir . 'src2_hea.txt' , "\n" ;
print $ FILEH_F $ easy_ua , "\n" ;
}
@ -147,6 +157,7 @@ udp_timeout => 10
my $ reply = $ res - > search ( "$_[0]" , "PTR" ) ;
if ( $ reply ) {
foreach my $ rr ( grep { $ _ - > type eq "PTR" } $ reply - > answer ) { # Do not assume rr-objects are of the same type as requested (use grep).
print 'Got ' , $ rr - > ptrdname , "\n" ;
push @ obj , $ rr - > ptrdname ;
}
}
@ -161,20 +172,20 @@ $p->ping($host);
( my $ tmphost = $ host ) =~ s/[.][0-9.]+$// ;
$ host [ $ tmphost ] = $ host ;
}
while ( my ( $ host ) = $ p - > ack ) { push @ { $ hoa { $ _ [ 0 ] } } , $ host ; $ host =~ s/[.][0-9.]+$// ; splice @ host , $ host }
while ( my ( $ host ) = $ p - > ack ) { print 'Queueing ' , $ host , ' because of ' , $ _ [ 1 ] , '-ACK on port ' , $ _ [ 0 ] , "\n" ; push @ { $ hoa { $ _ [ 0 ] } } , $ host ; $ host =~ s/[.][0-9.]+$// ; splice @ host , $ host }
}
# Resume support for the nested scan loops: while $initvar is 1, iterations
# that sort before the stored start point ($ii4, $ii3, $ii2) are skipped.
# NOTE(review): each loop header appears twice (patch hunk, range 0 .. 254
# and range 0 .. 255); presumably the first is the pre-change line.
# NOTE(review): where $initvar is cleared is not visible in this span.
$ initvar = 1 ;
# Without a continuation point the start defaults to (1, 0, 0, 0) for ($ii1, $ii2, $ii3, $ii4).
unless ( defined ( $ continue_from ) ) { ( $ ii1 , $ ii2 , $ ii3 , $ ii4 ) = ( 1 , 0 , 0 , 0 ) }
LABEL4: foreach my $ i4 ( 0 .. 254 ) {
LABEL4: foreach my $ i4 ( 0 .. 255 ) {
if ( $ initvar == 1 ) {
until ( $ i4 >= $ ii4 ) { next LABEL4 }
}
LABEL3: foreach my $ i3 ( 0 .. 254 ) {
LABEL3: foreach my $ i3 ( 0 .. 255 ) {
if ( $ initvar == 1 ) {
# NOTE(review): the position is weighted with powers of 255, while the
# 0 .. 255 variant iterates 256 values per octet - confirm whether 256 is intended.
until ( int ( $ i4 * 255 ** 3 + $ i3 * 255 ** 2 ) >= int ( $ ii4 * 255 ** 3 + $ ii3 * 255 ** 2 ) ) { next LABEL3 }
}
LABEL2: foreach my $ i2 ( 0 .. 254 ) {
LABEL2: foreach my $ i2 ( 0 .. 255 ) {
if ( $ initvar == 1 ) {
until ( int ( $ i4 * 255 ** 3 + $ i3 * 255 ** 2 + $ i2 * 255 ) >= int ( $ ii4 * 255 ** 3 + $ ii3 * 255 ** 2 + $ ii2 * 255 ) ) { next LABEL2 }
}
@ -188,13 +199,13 @@ undef %hoa;
# Probe TCP port 80: the three loop octets form the address suffix, and every
# candidate first octet $i1 is pinged through $p before the replies are
# collected with ack.
# NOTE(review): the foreach header and the while line each appear in two
# variants (patch hunk); presumably the first is the pre-change line.
my $ host_end = $ i2 . '.' . $ i3 . '.' . $ i4 ;
$ args { 'port' } = '80' ; $ args { 'proto' } = 'tcp' ;
$ p - > port_number ( $ args { 'port' } ) ;
# $iii1, $iii2 and $iii3 are set outside this span; the range ends 9 and 126
# leave out first octets 10 and 127 as long as $iii2 and $iii3 start above them.
foreach my $ i1 ( $ iii1 .. 9 , $ iii2 .. 126 , $ iii3 .. 254 ) { # Skip large, private ipv4 blocks.
foreach my $ i1 ( $ iii1 .. 9 , $ iii2 .. 126 , $ iii3 .. 255 ) { # Skip large, private ipv4 blocks.
$ args { 'host' } = $ i1 . '.' . $ host_end ;
$ p - > ping ( $ args { 'host' } ) ;
# Remember the probed address under its first octet.
$ host [ $ i1 ] = $ args { 'host' } ;
}
& connectivity_check ;
# Each acknowledged host is queued in $hoa{'80'}; the substitution then
# reduces $host to its first octet, which is used as the splice offset.
# NOTE(review): splice with only an offset removes every element from that
# index to the end of @host, not just one - confirm this is intended.
while ( my ( $ host ) = $ p - > ack ) { push @ { $ hoa { '80' } } , $ host ; $ host =~ s/[.][0-9.]+$// ; splice @ host , $ host }
while ( my ( $ host ) = $ p - > ack ) { print 'Queueing ' , $ host , ' because of ' , $ args { 'proto' } , '-ACK on port ' , $ args { 'port' } , "\n" ; push @ { $ hoa { '80' } } , $ host ; $ host =~ s/[.][0-9.]+$// ; splice @ host , $ host }
& connectivity_check ;
& syn_ping_elmn ( 80 , 'udp' ) ;
@ -204,19 +215,21 @@ while (my ($host) = $p->ack) {push @{$hoa{'80'}}, $host; $host =~ s/[.][0-9.]+$/
# For every host queued for port 80: touch ./lastip, look up its PTR names,
# then fetch each name over http. When the lookup yields a single empty
# string, the bare ip is fetched instead.
# NOTE(review): the closing eval line appears in two variants (patch hunk),
# the second carrying a trailing comment.
foreach my $ host ( @ { $ hoa { '80' } } ) {
next unless defined ( $ host ) ;
# Update the access and modification time of ./lastip.
utime time , time , './lastip' ;
print 'Looking up PTR-records for ' , $ host , "\n" ;
my @ rray = & reverse_dns_doms ( $ host ) ;
if ( $# rray == 0 and $ rray [ 0 ] eq '' ) { eval { & get_80_src ( $ host , $ host ) } } else {
foreach ( @ rray ) {
# NOTE(review): the argument is always below 1; Perl's built-in sleep takes
# whole seconds, so unless a fractional sleep is imported elsewhere this
# presumably does not pause at all - confirm.
sleep rand ( 1 ) / ( rand ( 10 ) + 1 ) ;
eval { & get_80_src ( $ _ , $ host ) } } } }
eval { & get_80_src ( $ _ , $ host ) } } } } # Catch non-fatal browser errors.
# For every host queued for port 443: touch ./lastip, look up its PTR names,
# then fetch each name over https. When the lookup yields a single empty
# string, the bare ip is fetched instead.
# NOTE(review): the if line and the closing eval line each appear in two
# variants (patch hunk); presumably the first is the pre-change line.
foreach my $ host ( @ { $ hoa { '443' } } ) {
next unless defined ( $ host ) ;
# Update the access and modification time of ./lastip.
utime time , time , './lastip' ;
print 'Looking up PTR-records for ' , $ host , "\n" ;
my @ rray = & reverse_dns_doms ( $ host ) ;
# NOTE(review): one variant wraps the bare-ip call in eval and the other does
# not, whereas the port-80 loop wraps it - confirm which behaviour is wanted.
if ( $# rray == 0 and $ rray [ 0 ] eq '' ) { eval { & get_443_src ( $ host , $ host ) } } else {
if ( $# rray == 0 and $ rray [ 0 ] eq '' ) { & get_443_src ( $ host , $ host ) } else {
foreach ( @ rray ) {
# NOTE(review): the argument is always below 1; Perl's built-in sleep takes
# whole seconds, so unless a fractional sleep is imported elsewhere this
# presumably does not pause at all - confirm.
sleep rand ( 1 ) / ( rand ( 10 ) + 1 ) ;
eval { & get_443_src ( $ _ , $ host ) } } } }
eval { & get_443_src ( $ _ , $ host ) } } } } # Catch non-fatal browser errors.
utime time , time , './lastip' ;
& connectivity_check ;
$ p - > close ( ) ;