Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / sun4-solaris / HTML / LinkExtor.pm
CommitLineData
86530b38
AT
1package HTML::LinkExtor;
2
3=head1 NAME
4
5HTML::LinkExtor - Extract links from an HTML document
6
7=head1 SYNOPSIS
8
9 require HTML::LinkExtor;
10 $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
11 sub cb {
12 my($tag, %links) = @_;
13 print "$tag @{[%links]}\n";
14 }
15 $p->parse_file("index.html");
16
17=head1 DESCRIPTION
18
19I<HTML::LinkExtor> is an HTML parser that extracts links from an
20HTML document. The I<HTML::LinkExtor> is a subclass of
21I<HTML::Parser>. This means that the document should be given to the
22parser by calling the $p->parse() or $p->parse_file() methods.
23
24=cut
25
26require HTML::Parser;
27@ISA = qw(HTML::Parser);
28$VERSION = sprintf("%d.%02d", q$Revision: 1.31 $ =~ /(\d+)\.(\d+)/);
29
30use strict;
31use HTML::Tagset ();
32
33# legacy (some applications grabs this hash directly)
34use vars qw(%LINK_ELEMENT);
35*LINK_ELEMENT = \%HTML::Tagset::linkElements;
36
37=over 4
38
39=item $p = HTML::LinkExtor->new([$callback[, $base]])
40
41The constructor takes two optional arguments. The first is a reference
42to a callback routine. It will be called as links are found. If a
43callback is not provided, then links are just accumulated internally
44and can be retrieved by calling the $p->links() method.
45
46The $base argument is an optional base URL used to absolutize all URLs found.
47You need to have the I<URI> module installed if you provide $base.
48
49The callback is called with the lowercase tag name as first argument,
50and then all link attributes as separate key/value pairs. All
51non-link attributes are removed.
52
53=cut
54
55sub new
56{
57 my($class, $cb, $base) = @_;
58 my $self = $class->SUPER::new(
59 start_h => ["_start_tag", "self,tagname,attr"],
60 report_tags => [keys %HTML::Tagset::linkElements],
61 );
62 $self->{extractlink_cb} = $cb;
63 if ($base) {
64 require URI;
65 $self->{extractlink_base} = URI->new($base);
66 }
67 $self;
68}
69
70sub _start_tag
71{
72 my($self, $tag, $attr) = @_;
73
74 my $base = $self->{extractlink_base};
75 my $links = $HTML::Tagset::linkElements{$tag};
76 $links = [$links] unless ref $links;
77
78 my @links;
79 my $a;
80 for $a (@$links) {
81 next unless exists $attr->{$a};
82 push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
83 : $attr->{$a});
84 }
85 return unless @links;
86 $self->_found_link($tag, @links);
87}
88
89sub _found_link
90{
91 my $self = shift;
92 my $cb = $self->{extractlink_cb};
93 if ($cb) {
94 &$cb(@_);
95 } else {
96 push(@{$self->{'links'}}, [@_]);
97 }
98}
99
100=item $p->links
101
102Returns a list of all links found in the document. The returned
103values will be anonymous arrays with the follwing elements:
104
105 [$tag, $attr => $url1, $attr2 => $url2,...]
106
107The $p->links method will also truncate the internal link list. This
108means that if the method is called twice without any parsing
109between them the second call will return an empty list.
110
111Also note that $p->links will always be empty if a callback routine
112was provided when the I<HTML::LinkExtor> was created.
113
114=cut
115
116sub links
117{
118 my $self = shift;
119 exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
120}
121
122# We override the parse_file() method so that we can clear the links
123# before we start a new file.
124sub parse_file
125{
126 my $self = shift;
127 delete $self->{'links'};
128 $self->SUPER::parse_file(@_);
129}
130
131=back
132
133=head1 EXAMPLE
134
135This is an example showing how you can extract links from a document
136received using LWP:
137
138 use LWP::UserAgent;
139 use HTML::LinkExtor;
140 use URI::URL;
141
142 $url = "http://www.perl.org/"; # for instance
143 $ua = LWP::UserAgent->new;
144
145 # Set up a callback that collect image links
146 my @imgs = ();
147 sub callback {
148 my($tag, %attr) = @_;
149 return if $tag ne 'img'; # we only look closer at <img ...>
150 push(@imgs, values %attr);
151 }
152
153 # Make the parser. Unfortunately, we don't know the base yet
154 # (it might be diffent from $url)
155 $p = HTML::LinkExtor->new(\&callback);
156
157 # Request document and parse it as it arrives
158 $res = $ua->request(HTTP::Request->new(GET => $url),
159 sub {$p->parse($_[0])});
160
161 # Expand all image URLs to absolute ones
162 my $base = $res->base;
163 @imgs = map { $_ = url($_, $base)->abs; } @imgs;
164
165 # Print them out
166 print join("\n", @imgs), "\n";
167
168=head1 SEE ALSO
169
170L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
171
172=head1 COPYRIGHT
173
174Copyright 1996-2001 Gisle Aas.
175
176This library is free software; you can redistribute it and/or
177modify it under the same terms as Perl itself.
178
179=cut
180
1811;