Kan beter onderbroken worden (alles met mate).

main
Roy 2 years ago
parent af93ecb85b
commit 543bc2d44c
  1. 265
      senicup.pl

@ -1,143 +1,153 @@
#!/bin/env perl
#use diagnostics;
#!/usr/bin/env perl
# Multi-UA web scraper written in Perl. Based on scanning the non-private IPv4 network in reverse order and requesting rDNS if a web-related port is open.
# (C) Roy van Lunsen
# TODO: Add DB support for storing HTMLs, b64 screenshots, etc.
# TODO: Utilize the DB for randomizing the scanning of IPv4 blocks supporting continuation.
# TODO: Add RE-based querying system for scoped IPv4/domain scrapes.
# TODO: Reliably detect when being blocked by companies that own (too) much network space and delay requests appropriately (might not be necessary if various parts are sufficiently randomized).
use strict;
use warnings;
use utf8;
use feature 'unicode_strings';
use File::Path qw(make_path);
use Digest::SHA qw/sha1_hex/;
use Net::Ping;
use Net::DNS;
use Net::Curl::Easy qw/:constants/;
use Selenium::Firefox;
use Selenium::Firefox::Profile;
#use LWP::UserAgent;
# Shared libcurl handle; later code points it at a URL and calls perform() on it.
# It identifies itself with the fixed user-agent string below, follows
# redirects (at most 10) and uses a transfer timeout of 15.
my $easy_ua = 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0';
my $easy = Net::Curl::Easy->new;
foreach my $curl_opt (
    [CURLOPT_USERAGENT,      $easy_ua],
    [CURLOPT_FOLLOWLOCATION, 1],
    [CURLOPT_MAXREDIRS,      10],
    [CURLOPT_TIMEOUT,        15],
) {
    # Each entry is an (option constant, value) pair.
    $easy->setopt(@{$curl_opt});
}
# Scratch variables that are declared here and assigned further down the script.
my ($continue_from, $ii1, $ii2, $ii3, $ii4, $iii1, $iii2, $iii3, $initvar, $alpha, $beta, $delta, $gamma, $retry_i);
# Optional first argument: the ipv4 address to continue scanning from.
# A non-empty argument is always validated (so "0" is rejected instead of being
# silently treated as "no argument"); without one a fresh scan starts after a
# five second countdown.
if (defined $ARGV[0] and length $ARGV[0]) {
    chomp($continue_from = $ARGV[0]);
    # Four dot-separated decimal fields, each of which must fit in one octet.
    my @octets = $continue_from =~ m/^([0-9]+)[.]([0-9]+)[.]([0-9]+)[.]([0-9]+)$/;
    if (@octets != 4 or grep { $_ > 255 } @octets) {
        # No system call failed here, so $! is deliberately left out of the message.
        die "\"$continue_from\" is not a valid ipv4 address.\n";
    }
    ($ii1, $ii2, $ii3, $ii4) = @octets;
    print 'Continuing from ipv4 address ', $continue_from, "\n";
} else {
    print "\n\n";
    foreach (0..4) {
        print 'Starting a new tcp/udp 80/443 ipv4 scan in ', (5-${_}), "s.\n";
        sleep 1;
    }
    print 'Starting...', "\n";
}
# Global state plus browser/pinger setup.
# NOTE(review): this span declares $profile and $driver twice and ends in an
# unconditional exit; it looks like two revisions of the setup code shown
# together -- confirm which statements are current before relying on it.
my (%args, %args2, %hoa, @host);
# Ghacks user.js is a good start.
# First variant: build the Firefox profile from a fixed profile directory.
my $profile = Selenium::Firefox::Profile->new(profile_dir => '/home/miami/.Mozilla3/Firefox/ud8j40yn.default/');
#$profile->new('/home/miami/.Mozilla3/Firefox/ud8j40yn.default/');
#foreach (%{%$profile{'user_prefs'}}) {print $_, "\n"}
#exit;
#my $ua = LWP::UserAgent->new(agent => 'ojffkfldnnnnsdvf');
# First variant: start a driver with that profile and switch on debug output.
my $driver = Selenium::Firefox->new('firefox_profile' => $profile);
$driver->debug_on;
# Second variant: the same profile directory passed via %args2. These "my"
# declarations mask the $profile/$driver declared above in the same scope.
$args2{'profile_dir'} = '/home/miami/.Mozilla3/Firefox/ud8j40yn.default/'; # Ghacks user.js is a good start.
my $profile = Selenium::Firefox::Profile->new(%args2);
my $driver = Selenium::Firefox->new(
firefox_profile => $profile,
marionette_enabled => 1
);
# Per-category driver timeouts (presumably milliseconds -- confirm against the Selenium docs).
$driver->set_timeout('script', 10000);
$driver->set_timeout('implicit', 20000);
$driver->set_timeout('page load', 15000);
# Pinger in "syn" mode; the second constructor argument (3) is the default timeout.
my $p = Net::Ping->new("syn", 3);
# NOTE(review): the remaining statements set a user agent, load one fixed URL,
# print the user agent and then exit, so nothing below this point would run
# at top level -- presumably a debugging leftover; confirm.
$driver->set_user_agent('efjnvgjkdnl'); #window.navigator.userAgent
$driver->get("https://xn--eekf.net");
#eval {print $driver->get_current_url()} or print "hereitis: https://i.redd.it/2nynaq6qwcb91.jpg\n";
#print $driver->screenshot({'full' => 1});
#print $driver->get_page_source();
print $driver->get_user_agent();
exit;
# interrupt([$host])
#
# Teardown helper for an interrupted scan.
#   $host - value to remember for a later continuation. When exactly one
#           defined argument is given it is written, followed by a newline,
#           to ./lastip; otherwise "no args" is printed and ./lastip is
#           left untouched (an undefined host no longer truncates the file).
# Afterwards the Net::Ping object in $p is closed and the browser binary
# behind $driver is shut down. The sub does not exit by itself; the return
# value is whatever $driver->shutdown_binary returns.
sub interrupt {
    my @given = @_;
    if (@given == 1 and defined $given[0]) {
        # Report a failed write instead of printing to an unopened handle.
        if (open my $lastip_fh, '>', './lastip') {
            print {$lastip_fh} $given[0], "\n";
            close $lastip_fh or warn "Could not finish writing ./lastip: $!\n";
        } else {
            warn "Could not open ./lastip for writing: $!\n";
        }
    } else {
        print "no args", "\n";
    }
    $p->close();
    $driver->shutdown_binary;
}
# Install the teardown handler for INT (Ctrl-C) and HUP.
# A %SIG entry has to be a code reference or the bare name of a sub; a string
# holding a call expression is looked up as a sub name and never evaluated, so
# closures are used to hand the host currently stored in %args to interrupt().
# interrupt() closes the pinger and shuts the browser down, so the process
# exits (non-zero, as the run did not complete) instead of resuming the scan.
foreach my $signame ('INT', 'HUP') {
    $SIG{$signame} = sub { interrupt($args{'host'}); exit 1 };
}
# connectivity_check([$path])
#
# Looks at the local wireless link and flags the current host for a retry when
# the link appears to be down.
#   $path - optional; a file or directory path whose (possibly incomplete)
#           capture files are deleted when the check fails.
# The check passes when `ip a` yields a dotted-quad IPv4 address for an
# interface whose name starts with "wl" AND the first line read from the first
# /sys/class/net/wl*/carrier file compares numerically equal to 1.
# On failure $retry_i is set to 1; nothing is returned explicitly.
# NOTE(review): the two cleanup lines below carry the same trailing comment and
# look like an older and a newer variant of one step -- confirm which should stay.
sub connectivity_check {
# Pipeline: keep the "N: wl...:" interface header plus the two lines after it,
# keep the line containing the word "inet", and cut out the address before the "/".
chomp(my $connectivity = `ip a | grep -A 2 -Ei '^[0-9]+: wl[^:]+:' | grep -E '\\s+?inet\\b' | sed -E 's/^\\s+?inet\\s+?([^/]+).*\$/\\1/'`); # Will do for now.
# NOTE(review): this is `eval EXPR`, i.e. the return value of open() is what
# gets evaluated; a failed open() is neither trapped nor reported here.
eval open my $TMP_FH, '<', (glob '/sys/class/net/wl*/carrier')[0];
unless ($connectivity =~ m/^[0-9]+([.][0-9]+){3}$/ and <$TMP_FH> == 1) {
# Signal the main loop that the current address has to be redone.
$retry_i = 1;
# Variant 1: delete the file named by the argument, strip the last path
# component from $_[0] in place (this aliases the caller's argument), then
# delete title.txt next to it and remove the directory (rmdir only succeeds when it is empty).
if (defined($_[0])) {unlink "$_[0]"; $_[0] =~ s,/[^/]+$,,; unlink "$_[0]/title.txt"; rmdir "$_[0]/"} # Delete potentially incomplete items from the last ipv4 address/domain, for redoing.
# Variant 2: treat $_[0] as a directory, delete the listed capture files in it
# and remove the directory (again only if that leaves it empty).
if (defined($_[0])) {unlink ("$_[0]/src_hea.txt","$_[0]/src_hes.txt","$_[0]/src.html","$_[0]/url.txt","$_[0]/scs.png","$_[0]/src2_hea.txt","$_[0]/src2_hes.txt","$_[0]/src2.html"); rmdir "$_[0]/"} # Delete potentially incomplete items from the last ipv4 address/domain, for redoing.
}
}
sub double_80_screenshot {
if ($#_ > 0) {die "$!: Too much arguments: \"$_[0]\"...\"$_[$#_]\"."}
sub get_80_src {
my $digest = sha1_hex($_[0]); # Use the sha1sum of the domain (if unavailable ip) (w/o protocol prefix) to build directories.
my $l1 = substr($digest, 0, 1);
my $l2 = substr($digest, 1, 1);
my $l3 = substr($digest, 2, 1);
my $l4 = substr($digest, 3, 1);
my $datadir = './data/'."$l1/$l2/$l3/$l4/$digest/";
make_path($datadir) or return 10;
&connectivity_check;
my $time_in_s = time;
eval {$driver->get("http://$_[0]")}; # Fetch the eye-candy.
eval {$driver->dismiss_alert};
eval {$driver->accept_alert};
my $current_url;
eval {$current_url = $driver->get_current_url()};
if ($current_url) {
eval {make_path("./data/$_[0]/$current_url/")};
open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64");
} else {eval {make_path("./data/$_[0]/-/")};
open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/-/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64");
}
$time_in_s = time;
eval {$driver->get("view-source:http://$_[0]")}; # Fetch the page source for (partly) reproduction.
undef $current_url;
eval {$current_url = $driver->get_current_url()};
if ($current_url) {
eval {make_path("./data/$_[0]/$current_url/")};
open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64");
} else {eval {make_path("./data/$_[0]/-/")};
open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/-/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64");
}
$driver->get("http://$_[0]");
open my $FILEH_A, '>:encoding(UTF-8)', $datadir.'src.html';
open my $FILEH_B, '>', $datadir.'url.txt';
my $FILEH_C;
open my $FILEH_D, '>', $datadir.'src_hea.txt'; # User-agent headers.
open my $FILEH_E, '>', $datadir.'src2_hes.txt'; # Server headers (only applicable for Curl).
open my $FILEH_F, '>', $datadir.'src2_hea.txt';
eval {print $FILEH_A $driver->get_page_source("http://$_[0]")};
&connectivity_check($datadir);
eval {$driver->capture_screenshot($datadir.'scs.png', {'full' => 1})};
$easy->setopt(CURLOPT_URL, "http://$_[0]");
if (-z $datadir.'scs.png') { # A zero-length screenshot happens with full-screen images. Assume non-html MIME.
open $FILEH_C, '>', $datadir.'scs.png'} else {open $FILEH_C, '>', $datadir.'src2.html'}
$easy->setopt(CURLOPT_FILE, $FILEH_C);
$easy->setopt(CURLOPT_HEADERDATA, $FILEH_E);
eval {$easy->perform()};
print $FILEH_B $_[1]; # First, print the ip.
print $FILEH_B "\n";
eval {print $FILEH_B $driver->get_current_url()}; # Then print the (redirected to) browser location.
print $FILEH_B "\n";
print $FILEH_B 'http://', $_[0]; # Lastly, print visited domain/ip.
print $FILEH_B "\n";
eval {print $FILEH_D $driver->get_user_agent()};
print $FILEH_D "\n";
print $FILEH_F $easy_ua, "\n";
}
sub double_443_screenshot {
if ($#_ > 0) {die "$!: Too much arguments: \"$_[0]\"...\"$_[$#_]\"."}
sub get_443_src {
my $digest = sha1_hex($_[0]); # Use the sha1sum of the domain (if unavailable ip) (w/o protocol prefix) to build directories.
my $l1 = substr($digest, 0, 1);
my $l2 = substr($digest, 1, 1);
my $l3 = substr($digest, 2, 1);
my $l4 = substr($digest, 3, 1);
my $datadir = './data/'."$l1/$l2/$l3/$l4/$digest/";
make_path($datadir) or return 10;
&connectivity_check;
my $time_in_s = time;
eval {$driver->get("https://$_[0]")};
eval {$driver->dismiss_alert};
eval {$driver->accept_alert};
my $current_url;
eval {$current_url = $driver->get_current_url()};
if ($current_url) {
eval {make_path("./data/$_[0]/$current_url/")};
open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64");
} else {eval {make_path("./data/$_[0]/-/")};
open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/-/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64");
}
$time_in_s = time;
eval {$driver->get("view-source:https://$_[0]")};
undef $current_url;
eval {$current_url = $driver->get_current_url()};
if ($current_url) {
eval {make_path("./data/$_[0]/$current_url/")};
open my $FILEH_B, '>', "./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/$current_url/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/$current_url/screenshot-$time_in_s.png.base64");
} else {eval {make_path("./data/$_[0]/-/")};
open my $FILEH_B, '>', "./data/$_[0]/-/screenshot-$time_in_s.png.base64";
open my $FILEH_C, '>', "./data/$_[0]/-/title.txt";
eval {print $FILEH_B $driver->screenshot({'full' => 1})};
print $FILEH_C $driver->get_title();
&connectivity_check("./data/$_[0]/-/screenshot-$time_in_s.png.base64");
}
$driver->get("https://$_[0]");
open my $FILEH_A, '>:encoding(UTF-8)', $datadir.'src.html';
open my $FILEH_B, '>', $datadir.'url.txt';
my $FILEH_C;
open my $FILEH_D, '>', $datadir.'src_hea.txt'; # User-agent headers.
open my $FILEH_E, '>', $datadir.'src2_hes.txt'; # Server headers (only applicable for Curl).
open my $FILEH_F, '>', $datadir.'src2_hea.txt';
eval {print $FILEH_A $driver->get_page_source("https://$_[0]")};
&connectivity_check($datadir);
eval {$driver->capture_screenshot($datadir.'scs.png', {'full' => 1})};
$easy->setopt(CURLOPT_URL, "https://$_[0]");
if (-z $datadir.'scs.png') { # A zero-length screenshot happens with full-screen images. Assume non-html MIME.
open $FILEH_C, '>', $datadir.'scs.png'} else {open $FILEH_C, '>', $datadir.'src2.html'}
$easy->setopt(CURLOPT_FILE, $FILEH_C);
$easy->setopt(CURLOPT_HEADERDATA, $FILEH_E);
eval {$easy->perform()};
print $FILEH_B $_[1]; # First, print the ip.
print $FILEH_B "\n";
eval {print $FILEH_B $driver->get_current_url()}; # Then print the (redirected to) browser location.
print $FILEH_B "\n";
print $FILEH_B 'https://', $_[0]; # Lastly, print visited domain/ip.
print $FILEH_B "\n";
eval {print $FILEH_D $driver->get_user_agent()};
print $FILEH_D "\n";
print $FILEH_F $easy_ua, "\n";
}
sub reverse_dns_doms {
my (@obj);
my $res = Net::DNS::Resolver->new;
my $res = Net::DNS::Resolver->new(
tcp_timeout => 10,
udp_timeout => 10
);
&connectivity_check;
my $reply = $res->search("$_[0]", "PTR");
if ($reply) {
foreach my $rr (grep { $_->type eq "PTR" } $reply->answer) { # Do not assume rr-objects are of the same type as requested.
eval {push @obj, $rr->ptrdname};
foreach my $rr (grep { $_->type eq "PTR" } $reply->answer) { # Do not assume rr-objects are of the same type as requested (use grep).
push @obj, $rr->ptrdname;
}
}
return @obj;
@ -147,10 +157,9 @@ sub syn_ping_elmn {
$p->port_number($_[0]);
foreach my $host (@host) {
next unless defined($host);
$args{'host'} = $host;
$p->ping($args{'host'});
$host =~ s/[.][0-9.]+$//;
$host[$host] = $args{'host'};
$p->ping($host);
(my $tmphost = $host) =~ s/[.][0-9.]+$//;
$host[$tmphost] = $host;
}
while (my ($host) = $p->ack) {push @{$hoa{$_[0]}}, $host; $host =~ s/[.][0-9.]+$//; splice @host, $host}
}
@ -176,10 +185,11 @@ $initvar=0;
$alpha = time;
splice @host, 0, $#host;
undef %hoa;
my $host_end = $i2.'.'.$i3.'.'.$i4;
$args{'port'} = '80'; $args{'proto'} = 'tcp';
$p->port_number($args{'port'});
foreach my $i1 ($iii1..9,$iii2..126,$iii3..254) { # Skip large private blocks.
$args{'host'} = $i1.'.'.$i2.'.'.$i3.'.'.$i4;
foreach my $i1 ($iii1..9,$iii2..126,$iii3..254) { # Skip large, private ipv4 blocks.
$args{'host'} = $i1.'.'.$host_end;
$p->ping($args{'host'});
$host[$i1] = $args{'host'};
}
@ -191,32 +201,36 @@ while (my ($host) = $p->ack) {push @{$hoa{'80'}}, $host; $host =~ s/[.][0-9.]+$/
&syn_ping_elmn(443, 'udp');
&syn_ping_elmn(443, 'tcp');
open my $FILEH_A, '>>', "./data/domains.txt"; # Here, we'll store all the domains from the ipv4 addresses.
foreach my $host (@{$hoa{'80'}}) {
next unless defined($host);
print $FILEH_A $host.':80'."\n";
utime time, time, './lastip';
my @rray = &reverse_dns_doms($host);
if ($#rray == 0 and $rray[0] eq '') {&double_80_screenshot($host)} else {
foreach (@rray) {if ((-d "./data/$_/view-source:http:/$_/" or -d "./data/$_/view-source:https:/$_/") and (-d "./data/$_/http:/$_/" or -d "./data/$_/https:/$_/")) {print $FILEH_A "\n"; next};
if ($#rray == 0 and $rray[0] eq '') {eval {&get_80_src($host, $host)}} else {
foreach (@rray) {
sleep rand(1)/(rand(10)+1);
print $FILEH_A $_.','; &double_80_screenshot($_)}
print $FILEH_A "\n"}
print $FILEH_A "\n"}
eval {&get_80_src($_, $host)}}}}
foreach my $host (@{$hoa{'443'}}) {
next unless defined($host);
print $FILEH_A $host.':443'."\n";
utime time, time, './lastip';
my @rray = &reverse_dns_doms($host);
if ($#rray == 0 and $rray[0] eq '') {&double_443_screenshot($host)} else {
foreach (@rray) {if (-d "./data/$_/view-source:http:/$_/" or -d "./data/$_/view-source:https:/$_/" and (-d "./data/$_/http:/$_/" or -d "./data/$_/https:/$_/")) {print $FILEH_A "\n"; next};
if ($#rray == 0 and $rray[0] eq '') {eval {&get_443_src($host, $host)}} else {
foreach (@rray) {
sleep rand(1)/(rand(10)+1);
print $FILEH_A $_.','; &double_443_screenshot($_)}
print $FILEH_A "\n"}
print $FILEH_A "\n"}
eval {&get_443_src($_, $host)}}}}
utime time, time, './lastip';
&connectivity_check;
$p->close();
$beta = time;
$delta = $beta-$alpha;
$gamma = $i4*255**2+$i3*255**1+$i2;
if (int(rand(1000)) == 0) { # Memory usage may accumulate by visiting websites; restart once in a while.
open my $TMP_TMP, '>', './lastip';
print $TMP_TMP $args{'host'}, "\n";
close $TMP_TMP;
$p->close();
$driver->shutdown_binary;
exit 0;
}
if (3 > $delta or defined($retry_i)) {
$retry_i = undef;
print "$delta < 3.\n";
@ -227,15 +241,16 @@ sleep 2**$exp; $exp++; chomp($connectivity = `ip a | grep -A 2 -Ei '^[0-9]+: wl[
eval open my $TMP_FH, '<', (glob '/sys/class/net/wl*/carrier')[0];
unless ($connectivity =~ m/^[0-9]+([.][0-9]+){3}$/ and <$TMP_FH> == 1) {warn "$!: No wireless connectivity on (lexicographically) first wireless network."; if ($exp >= 10) {$exp -= int(rand(11))}} else {last}}
print "Retrying.\n";
$p->close();
$p = Net::Ping->new("syn", 3);
redo LABEL2;
}
$p->close();
$p = Net::Ping->new("syn", 3);
print 'Progress: ', $gamma/255**3*100, "%\n", 'ETA: ', (255**3-$gamma)*$delta, "s\n";
close $FILEH_A;
print "\n", 'To continue after quitting, provide ', '1.'.$i2.'.'.$i3.'.'.$i4, ' as first argument.', "\n\n";
print $args{'host'}, "\n";
}}}
# Final teardown at the end of the script.
# NOTE(review): quit() + killall and shutdown_binary look like two alternative
# teardown variants shown together ('Done!' can be printed by both) -- confirm
# which one is intended.
print 'Cleaning up...'."\n";
$driver->quit();
# Backticks yield killall's standard output, so this 'Done!' is printed only
# when that output is a true value; note that killall targets every process
# named geckodriver, not just the one started by this script.
`killall geckodriver` and print 'Done!'."\n";
$driver->shutdown_binary;
print 'Done!'."\n";

Loading…
Cancel
Save