Thanks, Kevin, for responding. That did not give me what I was looking for, but further searching on Google did the trick. I had to mash a number of different scripts together, but I can now efficiently process hundreds of Google Starred links and dump the selected links to a text file for further processing. It is in Ruby; maybe you can convert it to Perl for us?
Your Google Starred items must be set for public access. You can find the [bold]xxxxxxxxxxxxxxxxxxxxx[/bold] Google Reader account number in the public web link once you change the Starred view to public (under Settings -> Folders and Tags in Google Reader).
I start the script with:
[bold]ruby GRS.rb [skip][/bold]
[skip] is optional: use it if you have already parsed the Google links and only want to redo the download links.
#############################
# START OF SCRIPT
#############################
require 'net/http'
require 'uri'
require 'open-uri'
require 'rubygems'
require 'hpricot'
require 'simple-rss'
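# Note: hpricot and simple-rss are third-party gems; if they are missing,
# they should be installable with "gem install hpricot simple-rss"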
# Skip parsing Google Reader again only if "skip" was given and the link file already exists
if ARGV[0] == "skip" && File.exists?("GoogleReaderStarredLinks.txt")
  puts "Skipping Google Reader parsing"
else
  # Parse the Google Reader starred-items feed with SimpleRSS and append each link
  # to a text file for further processing
  feed = "[bold]xxxxxxxxxxxxxxxxxxxxx[/bold]/state/com.google/starred?n=500"
  rss = SimpleRSS.parse(open(feed))
  open('GoogleReaderStarredLinks.txt', 'a') do |f|
    rss.entries.each { |item| f.puts item.link }
  end
end
# Push every URL in the file into an array. This could have been done directly from the feed,
# but reading from the file lets the script be re-run without parsing Google Reader again,
# which takes time and bandwidth
urls = []
File.open('GoogleReaderStarredLinks.txt', 'r') do |file|
  file.readlines.each do |line|
    urls.push(line.chomp)
  end
end
# Loop through each of the URLs in the urls array
urls.each do |url|
  puts "Google Reader Link : " + url
  # Open the URL and check for errors (timeouts and HTTP);
  # if any occur, skip to the next URL
  begin
    url_object = open(url)
  rescue Timeout::Error
    puts "The request for a page at #{url} timed out...skipping."
    next
  rescue OpenURI::HTTPError
    puts "The request for a page at #{url} returned an error...skipping."
    next
  end
  next if url_object.nil?
  # Parse the page with Hpricot, i.e. read the HTML source of the page behind the
  # original link into doc
  doc = Hpricot(url_object)
  # Look for links in the page
  doc.search('a[@href]').each do |x|
    new_url = x['href'].split('#')[0]
    next if new_url.nil?
    # If the page links to one of the online file storage servers I have an account with,
    # put the link into a text file for further processing
    if new_url.include?('abc.com/files') || new_url.include?('pqr.com') || new_url.include?('xyz.com')
      open('DownloadLinks.txt', 'a') { |f| f.puts new_url }
      puts " Download link : " + new_url
    end
  end
end
#############################
# END OF SCRIPT
#############################
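One thing to watch: both text files are opened in append mode ('a'), so re-running the script adds the same links again. If that is a problem, a quick sketch like this (plain Ruby, same file name as above) should de-duplicate DownloadLinks.txt afterwards:

# Strip duplicate entries that build up because the script appends on every run
links = File.readlines('DownloadLinks.txt').map { |l| l.chomp }.uniq
File.open('DownloadLinks.txt', 'w') { |f| links.each { |link| f.puts link } }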