Short PERL SOURCE EXAMPLE


This Perl example is a small program that fetches the source of the web page named on its command line, extracts the URLs found on that page, and prints them to STDOUT.


#!/usr/bin/perl
#
# Program Name: getPage.pl
# Use: getPage.pl url
# description: This program uses the LWP::Simple module. It will
#  get a web page passed to it on the command line then strip out
#  and list all the urls on that page, returning the list to STDOUT.
#  The program will canonicalize the url. For example http://x.y.z will
#  have a slash appended to it, while http://x.y.z/mypage.html will not.
#  Also, local references will be expanded to include the source page url.
#  For example: href="localpage.htm" will have http://x.y.z/ prepended to
#  the local reference.
# 
#
###########
use strict;
use LWP::Simple;

# Script-wide variables shared by the steps below.
my $pg;
my $basepg;
my $url;

# Get the URL or exit with a usage message.
# A missing or empty argument is a usage error: report it on STDERR (so it
# never mixes with the url list on STDOUT) and exit non-zero so callers can
# tell that nothing was listed. $0 keeps the message in step with whatever
# name the script was actually invoked under.
if (!defined($ARGV[0]) || $ARGV[0] eq '')
{
   print STDERR "Usage is '$0 URL'\n";
   exit 1;
}else{
   $pg = $ARGV[0];
   chomp($pg);   # drop a trailing newline if the argument carries one
}

# Canonicalize the initial url.
#
# _canonize_url(URL) returns the list (PAGE, BASE):
#   PAGE - the url to fetch; a trailing slash is appended unless the url
#          names an .htm/.html file or already ends in a slash.
#   BASE - the prefix that local references are appended to; for an
#          .htm/.html url this is everything up to and including the last
#          slash (empty if there is no slash), otherwise it is PAGE itself.
# An undefined URL is treated as the empty string.
sub _canonize_url
{
   my ($page) = @_;
   $page = '' unless defined $page;

   # If a specific html file is specified, get the base page...
   # The dot is required and case is ignored, so INDEX.HTM counts as a
   # file while a directory that merely ends in "html" does not.
   if ($page =~ /\.html?\z/i)
   {
      # Take everything up to the last slash instead of whitelisting
      # characters, so hosts and paths containing '-', '~', '%' and the
      # like are not cut short.
      my ($base) = $page =~ m{\A(.*/)};
      return ($page, defined $base ? $base : '');
   }

   # Anything else is treated as a directory: make sure it ends in a slash.
   $page .= '/' unless $page =~ m{/\z};
   return ($page, $page);
}

($pg, $basepg) = _canonize_url($pg);

# Go get the page. This "get" is made available by LWP::Simple and signals
# failure by returning undef, so test for definedness rather than truth:
# an empty page (or one whose entire content is "0") is still a page.
my $webpg = get($pg);
die "no page $pg\n" unless defined $webpg;

# _extract_hrefs(TEXT) returns the list of href attribute values found in
# TEXT, in order of appearance. Every href is returned (not just the first),
# the attribute name is matched case-insensitively, whitespace around '=' is
# allowed, and both "double" and 'single' quoted values are accepted.
# Empty values (href="") are ignored. An undefined TEXT yields an empty list.
sub _extract_hrefs
{
   my ($text) = @_;
   return () unless defined $text;

   my @found;
   # A negated character class stops at the closing quote, so a later
   # attribute such as target="_blank" can never be swallowed.
   while ($text =~ /href\s*=\s*(?:"([^"]+)"|'([^']+)')/gi)
   {
      push @found, defined $1 ? $1 : $2;
   }
   return @found;
}

# Split page into lines load @webpgArry
my (@webpgArry) = split(/\n/, $webpg);

# Strip out the urls named on the page load into @urls
# (line by line, so an href value never spans a line break)
my @urls = map { _extract_hrefs($_) } @webpgArry;

# _resolve_link(BASE, LINK) turns one href value into the url to print.
#   - mailto: links return nothing (undef in scalar context) and are skipped
#   - links that start with http:, https: or ftp: are returned unchanged
#   - //host/path links borrow the scheme of BASE
#   - /path links are attached to the scheme://host part of BASE
#   - everything else is a local reference and gets BASE prepended
# Schemes are only recognised at the start of LINK, so a local page such as
# "ftp_docs/index.html" or "docs/mailto-howto.html" is still treated as local.
sub _resolve_link
{
   my ($base, $link) = @_;
   $base = '' unless defined $base;

   return if $link =~ /\A\s*mailto:/i;
   return $link if $link =~ m{\A\s*(?:https?|ftp):}i;

   # Split the base once; both slash-led forms need pieces of it.
   my ($scheme, $host) = $base =~ m{\A(\w+:)//([^/]+)};
   if (defined $scheme)
   {
      return $scheme . $link      if $link =~ m{\A//};
      return "$scheme//$host$link" if $link =~ m{\A/};
   }

   return $base . $link;
}

# de-localize a local page url if necessary and print, but don't print mailto links
foreach my $link (@urls)
{
   my $resolved = _resolve_link($basepg, $link);
   print $resolved . "\n" if defined $resolved;
}


exit;


  • Mail to mjr