diff --git a/senicup.pl b/senicup.pl index 55dae33..7a0618c 100755 --- a/senicup.pl +++ b/senicup.pl @@ -1,143 +1,153 @@ -#!/bin/env perl -#use diagnostics; +#!/usr/bin/env perl + +# Multi-UA web scraper written in Perl. Based on scanning the non-private IPv4 network in reverse order and requesting rDNS if a web-related port is open. +# (C) Roy van Lunsen + +# TODO: Add DB support for storing HTMLs, b64 screenshots, etc.. +# TODO: Utilize the DB for randomizing the scanning of IPv4 blocks supporting continuation. +# TODO: Add RE-based querying system for scoped IPv4/domain scrapes. +# TODO: Reliably detect when being blocked by companies that own (too) much network space and delay requests appropriately (might not be necessary if various parts are sufficiently randomized). + use strict; use warnings; use utf8; +use feature 'unicode_strings'; use File::Path qw(make_path); +use Digest::SHA qw/sha1_hex/; use Net::Ping; use Net::DNS; +use Net::Curl::Easy qw/:constants/; use Selenium::Firefox; use Selenium::Firefox::Profile; -#use LWP::UserAgent; +my $easy = Net::Curl::Easy->new; +my $easy_ua = 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'; +$easy->setopt(CURLOPT_USERAGENT, $easy_ua); +$easy->setopt(CURLOPT_FOLLOWLOCATION, 1); +$easy->setopt(CURLOPT_MAXREDIRS, 10); +$easy->setopt(CURLOPT_TIMEOUT, 15); my ($continue_from, $ii1, $ii2, $ii3, $ii4, $iii1, $iii2, $iii3, $initvar, $alpha, $beta, $delta, $gamma, $retry_i); if ($ARGV[0]) { chomp($continue_from = $ARGV[0]); if ($continue_from !~ m/^[0-9]+([.][0-9]+){3}$/) {die $!, "\"$continue_from\" is not a valid ipv4 address:" } else {($ii1, $ii2, $ii3, $ii4) = split /\./, $continue_from; print 'Continuing from ipv4 address ', $continue_from, "\n"}} else {print "\n\n"; foreach (0..4) {print 'Starting a new tcp/udp 80/443 ipv4 scan in ', (5-${_}), "s.\n"; sleep 1}; print 'Starting...', "\n"} my (%args, %args2, %hoa, @host); -# Ghacks user.js is a good start. -my $profile = Selenium::Firefox::Profile->new(profile_dir => '/home/miami/.Mozilla3/Firefox/ud8j40yn.default/'); -#$profile->new('/home/miami/.Mozilla3/Firefox/ud8j40yn.default/'); -#foreach (%{%$profile{'user_prefs'}}) {print $_, "\n"} -#exit; -#my $ua = LWP::UserAgent->new(agent => 'ojffkfldnnnnsdvf'); -my $driver = Selenium::Firefox->new('firefox_profile' => $profile); -$driver->debug_on; +$args2{'profile_dir'} = '/home/miami/.Mozilla3/Firefox/ud8j40yn.default/'; # Ghacks user.js is a good start. +my $profile = Selenium::Firefox::Profile->new(%args2); +my $driver = Selenium::Firefox->new( +firefox_profile => $profile, +marionette_enabled => 1 +); $driver->set_timeout('script', 10000); $driver->set_timeout('implicit', 20000); $driver->set_timeout('page load', 15000); my $p = Net::Ping->new("syn", 3); -$driver->set_user_agent('efjnvgjkdnl'); #window.navigator.userAgent -$driver->get("https://xn--eekf.net"); -#eval {print $driver->get_current_url()} or print "hereitis: https://i.redd.it/2nynaq6qwcb91.jpg\n"; -#print $driver->screenshot({'full' => 1}); -#print $driver->get_page_source(); -print $driver->get_user_agent(); -exit; + +sub interrupt { +if ($#_ == 0) { +open my $TMP_TMP, '>', './lastip'; +print $TMP_TMP $_[0], "\n"; +close $TMP_TMP; +} else {print "no args", "\n"} +$p->close(); +$driver->shutdown_binary; +} + +$SIG{'INT'} = 'interrupt($args{"host"})'; +$SIG{'HUP'} = 'interrupt($args{"host"})'; + sub connectivity_check { chomp(my $connectivity = `ip a | grep -A 2 -Ei '^[0-9]+: wl[^:]+:' | grep -E '\\s+?inet\\b' | sed -E 's/^\\s+?inet\\s+?([^/]+).*\$/\\1/'`); # Will do for now. eval open my $TMP_FH, '<', (glob '/sys/class/net/wl*/carrier')[0]; unless ($connectivity =~ m/^[0-9]+([.][0-9]+){3}$/ and <$TMP_FH> == 1) { $retry_i = 1; -if (defined($_[0])) {unlink "$_[0]"; $_[0] =~ s,/[^/]+$,,; unlink "$_[0]/title.txt"; rmdir "$_[0]/"} # Delete potentially incomplete items from the last ipv4 address/domain, for redoing. +if (defined($_[0])) {unlink ("$_[0]/src_hea.txt","$_[0]/src_hes.txt","$_[0]/src.html","$_[0]/url.txt","$_[0]/scs.png","$_[0]/src2_hea.txt","$_[0]/src2_hes.txt","$_[0]/src2.html"); rmdir "$_[0]/"} # Delete potentially incomplete items from the last ipv4 address/domain, for redoing. } } -sub double_80_screenshot { -if ($#_ > 0) {die "$!: Too much arguments: \"$_[0]\"...\"$_[$#_]\"."} +sub get_80_src { +my $digest = sha1_hex($_[0]); # Use the sha1sum of the domain (if unavailable ip) (w/o protocol prefix) to build directories. +my $l1 = substr($digest, 0, 1); +my $l2 = substr($digest, 1, 1); +my $l3 = substr($digest, 2, 1); +my $l4 = substr($digest, 3, 1); +my $datadir = './data/'."$l1/$l2/$l3/$l4/$digest/"; +make_path($datadir) or return 10; &connectivity_check; -my $time_in_s = time; -eval {$driver->get("http://$_[0]")}; # Fetch the eye-candy. -eval {$driver->dismiss_alert}; -eval {$driver->accept_alert}; -my $current_url; -eval {$current_url = $driver->get_current_url()}; -if ($current_url) { -eval {make_path("./data/$_[0]/$current_url/")}; -open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"); -} else {eval {make_path("./data/$_[0]/-/")}; -open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/-/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64"); -} -$time_in_s = time; -eval {$driver->get("view-source:http://$_[0]")}; # Fetch the page source for (partly) reproduction. -undef $current_url; -eval {$current_url = $driver->get_current_url()}; -if ($current_url) { -eval {make_path("./data/$_[0]/$current_url/")}; -open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"); -} else {eval {make_path("./data/$_[0]/-/")}; -open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/-/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64"); -} +$driver->get("http://$_[0]"); +open my $FILEH_A, '>:encoding(UTF-8)', $datadir.'src.html'; +open my $FILEH_B, '>', $datadir.'url.txt'; +my $FILEH_C; +open my $FILEH_D, '>', $datadir.'src_hea.txt'; # User-agent headers. +open my $FILEH_E, '>', $datadir.'src2_hes.txt'; # Server headers (only applicable for Curl). +open my $FILEH_F, '>', $datadir.'src2_hea.txt'; +eval {print $FILEH_A $driver->get_page_source("http://$_[0]")}; +&connectivity_check($datadir); +eval {$driver->capture_screenshot($datadir.'scs.png', {'full' => 1})}; +$easy->setopt(CURLOPT_URL, "http://$_[0]"); +if (-z $datadir.'scs.png') { # A zero-length screenshot happens with full-screen images. Assume non-html MIME. +open $FILEH_C, '>', $datadir.'scs.png'} else {open $FILEH_C, '>', $datadir.'src2.html'} +$easy->setopt(CURLOPT_FILE, $FILEH_C); +$easy->setopt(CURLOPT_HEADERDATA, $FILEH_E); +eval {$easy->perform()}; +print $FILEH_B $_[1]; # First, print the ip. +print $FILEH_B "\n"; +eval {print $FILEH_B $driver->get_current_url()}; # Then print the (redirected to) browser location. +print $FILEH_B "\n"; +print $FILEH_B 'http://', $_[0]; # Lastly, print visited domain/ip. +print $FILEH_B "\n"; +eval {print $FILEH_D $driver->get_user_agent()}; +print $FILEH_D "\n"; +print $FILEH_F $easy_ua, "\n"; } -sub double_443_screenshot { -if ($#_ > 0) {die "$!: Too much arguments: \"$_[0]\"...\"$_[$#_]\"."} +sub get_443_src { +my $digest = sha1_hex($_[0]); # Use the sha1sum of the domain (if unavailable ip) (w/o protocol prefix) to build directories. +my $l1 = substr($digest, 0, 1); +my $l2 = substr($digest, 1, 1); +my $l3 = substr($digest, 2, 1); +my $l4 = substr($digest, 3, 1); +my $datadir = './data/'."$l1/$l2/$l3/$l4/$digest/"; +make_path($datadir) or return 10; &connectivity_check; -my $time_in_s = time; -eval {$driver->get("https://$_[0]")}; -eval {$driver->dismiss_alert}; -eval {$driver->accept_alert}; -my $current_url; -eval {$current_url = $driver->get_current_url()}; -if ($current_url) { -eval {make_path("./data/$_[0]/$current_url/")}; -open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"); -} else {eval {make_path("./data/$_[0]/-/")}; -open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/-/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64"); -} -$time_in_s = time; -eval {$driver->get("view-source:https://$_[0]")}; -undef $current_url; -eval {$current_url = $driver->get_current_url()}; -if ($current_url) { -eval {make_path("./data/$_[0]/$current_url/")}; -open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64"); -} else {eval {make_path("./data/$_[0]/-/")}; -open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64"; -open my $FILEH_C, '>', "./data/$_[0]/-/title.txt"; -eval {print $FILEH_B $driver->screenshot({'full' => 1})}; -print $FILEH_C $driver->get_title(); -&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64"); -} +$driver->get("https://$_[0]"); +open my $FILEH_A, '>:encoding(UTF-8)', $datadir.'src.html'; +open my $FILEH_B, '>', $datadir.'url.txt'; +my $FILEH_C; +open my $FILEH_D, '>', $datadir.'src_hea.txt'; # User-agent headers. +open my $FILEH_E, '>', $datadir.'src2_hes.txt'; # Server headers (only applicable for Curl). +open my $FILEH_F, '>', $datadir.'src2_hea.txt'; +eval {print $FILEH_A $driver->get_page_source("https://$_[0]")}; +&connectivity_check($datadir); +eval {$driver->capture_screenshot($datadir.'scs.png', {'full' => 1})}; +$easy->setopt(CURLOPT_URL, "https://$_[0]"); +if (-z $datadir.'scs.png') { # A zero-length screenshot happens with full-screen images. Assume non-html MIME. +open $FILEH_C, '>', $datadir.'scs.png'} else {open $FILEH_C, '>', $datadir.'src2.html'} +$easy->setopt(CURLOPT_FILE, $FILEH_C); +$easy->setopt(CURLOPT_HEADERDATA, $FILEH_E); +eval {$easy->perform()}; +print $FILEH_B $_[1]; # First, print the ip. +print $FILEH_B "\n"; +eval {print $FILEH_B $driver->get_current_url()}; # Then print the (redirected to) browser location. +print $FILEH_B "\n"; +print $FILEH_B 'https://', $_[0]; # Lastly, print visited domain/ip. +print $FILEH_B "\n"; +eval {print $FILEH_D $driver->get_user_agent()}; +print $FILEH_D "\n"; +print $FILEH_F $easy_ua, "\n"; } sub reverse_dns_doms { my (@obj); -my $res = Net::DNS::Resolver->new; +my $res = Net::DNS::Resolver->new( +tcp_timeout => 10, +udp_timeout => 10 +); &connectivity_check; my $reply = $res->search("$_[0]", "PTR"); if ($reply) { -foreach my $rr (grep { $_->type eq "PTR" } $reply->answer) { # Do not assume rr-objects are of the same type as requested. -eval {push @obj, $rr->ptrdname}; +foreach my $rr (grep { $_->type eq "PTR" } $reply->answer) { # Do not assume rr-objects are of the same type as requested (use grep). +push @obj, $rr->ptrdname; } } return @obj; @@ -147,10 +157,9 @@ sub syn_ping_elmn { $p->port_number($_[0]); foreach my $host (@host) { next unless defined($host); -$args{'host'} = $host; -$p->ping($args{'host'}); -$host =~ s/[.][0-9.]+$//; -$host[$host] = $args{'host'}; +$p->ping($host); +(my $tmphost = $host) =~ s/[.][0-9.]+$//; +$host[$tmphost] = $host; } while (my ($host) = $p->ack) {push @{$hoa{$_[0]}}, $host; $host =~ s/[.][0-9.]+$//; splice @host, $host} } @@ -176,10 +185,11 @@ $initvar=0; $alpha = time; splice @host, 0, $#host; undef %hoa; +my $host_end = $i2.'.'.$i3.'.'.$i4; $args{'port'} = '80'; $args{'proto'} = 'tcp'; $p->port_number($args{'port'}); -foreach my $i1 ($iii1..9,$iii2..126,$iii3..254) { # Skip large private blocks. -$args{'host'} = $i1.'.'.$i2.'.'.$i3.'.'.$i4; +foreach my $i1 ($iii1..9,$iii2..126,$iii3..254) { # Skip large, private ipv4 blocks. +$args{'host'} = $i1.'.'.$host_end; $p->ping($args{'host'}); $host[$i1] = $args{'host'}; } @@ -191,32 +201,36 @@ while (my ($host) = $p->ack) {push @{$hoa{'80'}}, $host; $host =~ s/[.][0-9.]+$/ &syn_ping_elmn(443, 'udp'); &syn_ping_elmn(443, 'tcp'); -open my $FILEH_A, '>>', "./data/domains.txt"; # Here, we'll store all the domains from the ipv4 addresses. foreach my $host (@{$hoa{'80'}}) { next unless defined($host); - print $FILEH_A $host.':80'."\n"; + utime time, time, './lastip'; my @rray = &reverse_dns_doms($host); - if ($#rray == 0 and $rray[0] eq '') {&double_80_screenshot($host)} else { - foreach (@rray) {if ((-d "./data/$_/view-source:http:/$_/" or -d "./data/$_/view-source:https:/$_/") and (-d "./data/$_/http:/$_/" or -d "./data/$_/https:/$_/")) {print $FILEH_A "\n"; next}; + if ($#rray == 0 and $rray[0] eq '') {eval {&get_80_src($host, $host)}} else { + foreach (@rray) { sleep rand(1)/(rand(10)+1); - print $FILEH_A $_.','; &double_80_screenshot($_)} - print $FILEH_A "\n"} - print $FILEH_A "\n"} + eval {&get_80_src($_, $host)}}}} foreach my $host (@{$hoa{'443'}}) { next unless defined($host); - print $FILEH_A $host.':443'."\n"; + utime time, time, './lastip'; my @rray = &reverse_dns_doms($host); - if ($#rray == 0 and $rray[0] eq '') {&double_443_screenshot($host)} else { - foreach (@rray) {if (-d "./data/$_/view-source:http:/$_/" or -d "./data/$_/view-source:https:/$_/" and (-d "./data/$_/http:/$_/" or -d "./data/$_/https:/$_/")) {print $FILEH_A "\n"; next}; + if ($#rray == 0 and $rray[0] eq '') {eval {&get_443_src($host, $host)}} else { + foreach (@rray) { sleep rand(1)/(rand(10)+1); - print $FILEH_A $_.','; &double_443_screenshot($_)} - print $FILEH_A "\n"} - print $FILEH_A "\n"} + eval {&get_443_src($_, $host)}}}} +utime time, time, './lastip'; &connectivity_check; $p->close(); $beta = time; $delta = $beta-$alpha; $gamma = $i4*255**2+$i3*255**1+$i2; +if (int(rand(1000)) == 0) { # Memory usage may accumulate by visiting websites; restart once in a while. +open my $TMP_TMP, '>', './lastip'; +print $TMP_TMP $args{'host'}, "\n"; +close $TMP_TMP; +$p->close(); +$driver->shutdown_binary; +exit 0; +} if (3 > $delta or defined($retry_i)) { $retry_i = undef; print "$delta < 3.\n"; @@ -227,15 +241,16 @@ sleep 2**$exp; $exp++; chomp($connectivity = `ip a | grep -A 2 -Ei '^[0-9]+: wl[ eval open my $TMP_FH, '<', (glob '/sys/class/net/wl*/carrier')[0]; unless ($connectivity =~ m/^[0-9]+([.][0-9]+){3}$/ and <$TMP_FH> == 1) {warn "$!: No wireless connectivity on (lexicographically) first wireless network."; if ($exp >= 10) {$exp -= int(rand(11))}} else {last}} print "Retrying.\n"; +$p->close(); $p = Net::Ping->new("syn", 3); redo LABEL2; } +$p->close(); $p = Net::Ping->new("syn", 3); print 'Progress: ', $gamma/255**3*100, "%\n", 'ETA: ', (255**3-$gamma)*$delta, "s\n"; -close $FILEH_A; -print "\n", 'To continue after quitting, provide ', '1.'.$i2.'.'.$i3.'.'.$i4, ' as first argument.', "\n\n"; +print $args{'host'}, "\n"; }}} print 'Cleaning up...'."\n"; -$driver->quit(); -`killall geckodriver` and print 'Done!'."\n"; +$driver->shutdown_binary; +print 'Done!'."\n";