#!/usr/bin/ruby
require 'rubygems'
require 'json'
require 'hpricot'
require 'open-uri'

# Scrapes one ffffound user's listing page by page, collects metadata for
# every asset found, and prints the whole collection as one JSON array.
#
# Progress messages go to stderr so that stdout carries only the JSON.

domain = "http://ffffound.com/"
user   = "blech"
type   = "found"
offset = 750

# The listing is walked in steps of this many assets; a page holding fewer
# than this is treated as the last one.
page_size = 25

img = []

# Builds the listing URL for the given offset. A trailing slash on +domain+
# is dropped so the result does not contain "//home".
def listing_url(domain, user, type, offset)
  "#{domain.chomp('/')}/home/#{user}/#{type}/?offset=#{offset}&"
end

# Downloads +url+ and returns the parsed Hpricot document.
#
# URI#open (mixed in by open-uri) is used rather than Kernel#open, which no
# longer accepts URLs on Ruby 3.0+. The block form closes the underlying
# IO/Tempfile as soon as parsing is done.
def fetch_listing(url)
  URI.parse(url).open { |io| Hpricot(io) }
end

# Returns the IDs of the assets listed as related to +asset+: for each
# related link, everything after the first "/" found at index 2 or later
# of its href.
def related_ids(asset)
  (asset/"div.related_to_item_xs").map do |related|
    path = related.at("a")['href']
    path[(path.index(/\//, 2) + 1)..-1]
  end
end

# Extracts the metadata hash for a single "blockquote.asset" element.
#
# Raises NoMethodError if an expected element is missing, so a change in the
# page markup fails loudly instead of producing partial records.
def parse_asset(asset)
  # the title link carries both the image title and the original source URL
  title_link = (asset/"div.title").at("a")

  # from description, break out img url, date posted (relative!), count
  desc_elem = (asset/"div.description")
  desc      = desc_elem.inner_html
  date      = desc.gsub(/.*<br ?\/?>/, "").gsub(/<a h.*/, "")
  count     = desc_elem.at("a").inner_text.gsub(/\D/, "")

  # ffffound image URL and page URL, and ffffound ID (could generate
  # one from other but would lose ?c form)
  image_cell   = (asset/"table td")
  ffffound_url = image_cell.at("a")['href']
  ffffound_img = image_cell.at("img")['src']
  # the ID is the 7th "/"-separated piece of the image URL, up to the first "_"
  asset_id     = ffffound_img.split('/')[6].gsub(/_.*/, "")

  {
    :title        => title_link.inner_html,
    :src_url      => title_link["href"],
    :orig_img     => desc.gsub(/<br ?\/?>.*/, ""),
    :date         => date,
    :count        => count,
    :id           => asset_id,
    :ffffound_url => ffffound_url,
    :ffffound_img => ffffound_img,
    # might as well get related asset IDs
    :rel          => related_ids(asset)
  }
end

loop do
  doc    = fetch_listing(listing_url(domain, user, type, offset))
  images = (doc/"blockquote.asset")
  $stderr.puts "Got #{images.size} images at offset #{offset}"

  # unshift prepends, so the final array is in reverse of the order in
  # which the assets were encountered
  images.each { |image| img.unshift(parse_asset(image)) }

  # A short (or empty) page means there is nothing further to fetch; this
  # is more efficient than doing another fetch just to see zero results.
  break if images.size < page_size
  offset += page_size
end

$stderr.puts "Got #{img.size} images"

puts img.to_json # TODO puts image.to_database_table(s)
