IMDB Movie Scraping
From Scrubyt
OK, here it is: an IMDb movie scraper, for fun and profit (tm). Requires Scrubyt 0.3.4.
[edit] base.rb
gem 'RubyInline', '= 3.6.3'
require 'scrubyt'
module Scrapers
  # Common parent for scrapers; gives subclasses a shortcut for defining a
  # Scrubyt extractor.
  class Base
    # Passes the given block straight to Scrubyt::Extractor.define and
    # returns whatever that call returns.
    def extractor(&definition)
      Scrubyt::Extractor.define(&definition)
    end
    protected :extractor
  end
end
[edit] imdb.rb
##
# IMDB Scraper v0.2
#
# (C) 2007 Marcello Barnaba (vjt@openssl.it)
# (C) 2007 Peter Szinek (peter@rubyrailways.com)
#
# Released under the MIT License (http://www.opensource.org/licenses/mit-license.php)
#
module Scrapers
class IMDB < Base
# Builds a scraper for a single IMDB title.
#
# movie_id - String IMDB title id: "tt" followed only by digits,
#            e.g. "tt0434409".
#
# Raises ArgumentError when movie_id is not a String of exactly that form.
def initialize(movie_id)
  # \A and \z anchor the whole string. The previous ^ only anchored the start
  # of a line and nothing anchored the end, so values such as "x\ntt1" or
  # "tt1/../x" passed validation and were interpolated into the request URL.
  well_formed = movie_id.is_a?(String) && movie_id =~ /\Att\d+\z/
  raise ArgumentError, "Invalid movie id: #{movie_id}" unless well_formed
  @id = movie_id
end
# Runs a Scrubyt extractor against the IMDB title page for @id and returns
# the extraction result.
#
# The result is memoized in @scraped, so the extractor block is only
# evaluated on the first call; later calls return the cached object. The
# usage example on this page calls #to_hash on the returned value.
#
# NOTE(review): the XPaths and 'div[Label]' patterns below depend on the IMDB
# page markup -- presumably they stop matching when the layout changes.
def scrape
movie_url = "http://imdb.com/title/#@id/"
# Returns the first capture group of the first match: the text of a line up to
# its last space, optionally followed by "more".
# NOTE(review): scan returns [] when nothing matches, so [0][0] raises
# NoMethodError on nil in that case.
phrase_clean_proc = lambda {|x| x.scan(/\n?.*\n(.*) (more)?/)[0][0]}
# NOTE(review): both gsub arguments render as a plain space here, which would
# make this a no-op; presumably the first was a non-breaking space or an
# HTML entity lost in the wiki rendering -- confirm against the original file.
entity_clean_proc = lambda {|x| x.gsub(' ',' ')}
@scraped ||= extractor do
fetch movie_url
# The "main" div
movie '/html/body/div/div/layer/div/div' do
# The header div, containing movie title and year
name '/div/h1' do
# Strips a " (digits)" group, e.g. the release year, from the heading text
title lambda{|x| x.gsub(/ \(\d+\)/, '')}, :type => :script
year '/span/a[1]'
end
# The cast table, containing every actor
cast '/div[3]/div[14]/table[1]' do
actor '/tr/td[2]/a', :generalize => true
end
# Labelled info divs follow; each is selected by its label text.
director_div 'div[Director]' do
director_with_crap '/a' do
director '#text'
end
end
# select_indices(:all_but_last) presumably drops a trailing "more" link -- TODO confirm
writer_div 'div[Writers]' do
writer_with_crap '/a' do
writer entity_clean_proc, :type => :script
end.select_indices(:all_but_last)
end
genre_div 'div[Genre]' do
genre_with_crap '/a' do
genre '#text'
end.select_indices(:all_but_last)
end
tagline_div 'div[Tagline]' do
tagline phrase_clean_proc, :type => :script
end
outline_div 'div[Plot Outline]' do
outline phrase_clean_proc, :type => :script
end
keyword_div 'div[Plot Keywords]' do
keyword_with_crap '/a' do
keyword entity_clean_proc, :type => :script
end.select_indices(:all_but_last)
end
runtime_div 'div[Runtime]' do
runtime phrase_clean_proc, :type => :script
end
country_div 'div[Country]' do
country_with_crap '/a' do
country entity_clean_proc, :type => :script
end
end
color_div 'div[Color]' do
color_with_crap '/a' do
color '#text'
end
end
aspect_ratio_div 'div[Aspect Ratio]' do
# NOTE(review): spelled "aspect_ration"; the example output on this page shows
# the key :aspect_ration, so renaming it would change the result keys.
aspect_ration phrase_clean_proc, :type => :script
end
end.select_indices(:last) # movie
end
end
end
end
Example usage:
irb(main):001:0> require 'active_support'
=> true
irb(main):002:0> require 'base'
=> ["Scrapers"]
irb(main):003:0> require 'imdb'
=> []
irb(main):006:0> r = Scrapers::IMDB.new('tt0434409').scrape
=> [[[[], []], [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], [[[]]], [[[]], [[]]], [[[]], [[]], [[]], [[]]], [[]], [[]], [[[]], [[]], [[]], [[]], [[]]], [[]], [[[]], [[]], [[]]], [[[]]], [[]]]]
irb(main):007:0> pp r.to_hash
[{:director=>"James McTeigue",
:title=>"V for Vendetta",
:writer=>"Andy Wachowski,Larry Wachowski",
:tagline=>
"Remember, remember the 5th of November, the gun powder treason and plot. I know of no reason why the gun powder treason should ever be forgot.",
:runtime=>"132 min",
:actor=>
"Natalie Portman,Hugo Weaving,Stephen Rea,Stephen Fry,John Hurt,Tim Pigott-Smith,Rupert Graves,Roger Allam,Ben Miles,Sinéad Cusack,Natasha Wightman,John Standing,Eddie Marsan,Clive Ashborn,Emma Field-Rayner",
:color=>"Color",
:aspect_ration=>"2.35 : 1",
:outline=>
"A shadowy freedom fighter known only as \"V\" uses terrorist tactics to fight against his totalitarian society. Upon rescuing a girl from the secret police, he also finds his best chance at having an ally.",
:keyword=>"Anti Conformity,Police Officer Killed,Toilet,Poison,Dictator",
:year=>"2005",
:country=>"USA,UK,Germany",
:genre=>"Action,Drama,Sci-Fi,Thriller"}]
=> nil
Feedback is very much appreciated, either on the discussion page or via e-mail.
Enjoy!
