+++ /dev/null
-#!/usr/bin/perl -- # -*- Perl -*-\r
-\r
-# this needs some cleanup...\r
-\r
-my $PSTOTEXT = "pstotext";\r
-\r
-my $pdf = shift @ARGV;\r
-\r
-my $index = "";\r
-my $inindex = 0;\r
-open (F, "$PSTOTEXT $pdf |");\r
-while (<F>) {\r
- if (/^<\/index/) {\r
- $index .= $_;\r
- $inindex = 0;\r
- }\r
- $inindex = 1 if /^<index/;\r
-\r
- if ($inindex) {\r
- $index .= $_ if /^\s*</;\r
- }\r
-}\r
-\r
-my $cindex = "";\r
-while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {\r
- $cindex .= $1;\r
- $_ = $2;\r
- $index = $'; # '\r
-\r
- my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;\r
-\r
- # Expand ranges\r
- if ($#pages >= 0) {\r
- my @mpages = ();\r
- foreach my $page (@pages) {\r
- my $pageno = &pageno($page);\r
- if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -\r
- for (my $count = $1; $count <= $2; $count++) {\r
- push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");\r
- }\r
- } else {\r
- push (@mpages, $page);\r
- }\r
- }\r
- @pages = sort rangesort @mpages;\r
- }\r
-\r
- # Remove duplicates...\r
- if ($#pages > 0) {\r
- my @mpages = ();\r
- my $current = "";\r
- foreach my $page (@pages) {\r
- my $pageno = &pageno($page);\r
- if ($pageno ne $current) {\r
- push (@mpages, $page);\r
- $current = $pageno;\r
- }\r
- }\r
- @pages = @mpages;\r
- }\r
-\r
- # Collapse ranges...\r
- if ($#pages > 1) {\r
- my @cpages = ();\r
- while (@pages) {\r
- my $count = 0;\r
- my $len = &rangelen($count, @pages);\r
- if ($len <= 2) {\r
- my $page = shift @pages;\r
- push (@cpages, $page);\r
- } else {\r
- my $fpage = shift @pages;\r
- my $lpage = "";\r
- while ($len > 1) {\r
- $lpage = shift @pages;\r
- $len--;\r
- }\r
- my $fpno = &pageno($fpage);\r
- my $lpno = &pageno($lpage);\r
- $fpage =~ s/>$fpno</>${fpno}-$lpno</s;\r
- push (@cpages, $fpage);\r
- }\r
- }\r
- @pages = @cpages;\r
- }\r
-\r
- my $page = shift @pages;\r
- $page =~ s/\s*$//s;\r
- $cindex .= $page;\r
- while (@pages) {\r
- $page = shift @pages;\r
- $page =~ s/\s*$//s;\r
- $cindex .= ", $page";\r
- }\r
-}\r
-$cindex .= $index;\r
-\r
-print "$cindex\n";\r
-\r
-sub pageno {\r
- my $page = shift;\r
-\r
- $page =~ s/^<phrase.*?>//;\r
- $page =~ s/^<link.*?>//;\r
-\r
- return $1 if $page =~ /^([^<>]+)/;\r
- return "?";\r
-}\r
-\r
-sub rangesort {\r
- my $apno = &pageno($a);\r
- my $bpno = &pageno($b);\r
-\r
- # Make sure roman pages come before arabic ones, otherwise sort them in order\r
- return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);\r
- return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);\r
- return $apno <=> $bpno;\r
-}\r
-\r
-sub rangelen {\r
- my $count = shift;\r
- my @pages = @_;\r
- my $len = 1;\r
- my $inrange = 1;\r
-\r
- my $current = &pageno($pages[$count]);\r
- while ($count < $#pages && $inrange) {\r
- $count++;\r
- my $next = &pageno($pages[$count]);\r
- if ($current + 1 eq $next) {\r
- $current = $next;\r
- $inrange = 1;\r
- $len++;\r
- } else {\r
- $inrange = 0;\r
- }\r
- }\r
-\r
- return $len;\r
-}\r