rebol [
    Title: "xanga.r"
    Author: "Tim Wylie"
    Date: "May 18, 2007"
    Description: "This simply pulls the posts and comments from a xanga site, and can export to wordpress xml."
]

xanga: make object![
    ;builds an empty comment record
    make-comment: func[][
        make object![
            user: none
            userlink: none
            text: none
            date: none
        ]
    ]

    ;builds an empty post record
    make-post: func[][
        make object![
            title: none
            text: none
            date: none
            comments: none
            user: none
            categories: none
        ]
    ]

    ;route status messages to an external handler when one is defined
    output: func[str[string!]][either value? 'ext-output [ext-output str][print str]]

    ;convert xanga's US-style 12-hour date string into a REBOL date!
    ;(tmp_dt added to the locals so it no longer leaks into the global context)
    make-date: func[dt [string!] /locals tmp tmp_dt tm][
        tmp: parse dt none
        tmp_dt: parse tmp/2 "/"
        either tmp/4 = "PM"
            [either (not find tmp/3 "12:") [tm: (to-time tmp/3) + 12:00][tm: tmp/3]]
            [either (find tmp/3 "12:") [tm: (to-time tmp/3) - 12:00][tm: tmp/3]]
        to-date rejoin[tmp_dt/2 "/" tmp_dt/1 "/" tmp_dt/3 " " tm]
    ]
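    ;make-date assumes xanga dates arrive shaped like "Weekday, MM/DD/YYYY
    ;H:MM AM|PM" (that input shape is an assumption, not verified here): it
    ;reorders the date to DD/MM/YYYY and shifts the clock to 24-hour, so
    ;    make-date "Friday, 05/18/2007 1:30 PM"
    ;should yield 18-May-2007/13:30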
    ;this gets the data from each comment
    ;the literal xanga markup delimiters were stripped from this copy of the
    ;script; each <lost-tag> below is a placeholder for one of them
    get-comment: func[page [string!] /locals coms com usr usrlnk txt dt a][
        if error? try[
            com: make-comment
            page: find page {<lost-tag>}
            either parse load/markup page [
                thru <lost-tag>
                copy txt to <lost-tag> (com/text: txt)
                thru <lost-tag>
                copy dt string! (com/date: make-date first dt)
                copy a tag! (
                    parse to-string a [
                        "a" thru "href=" [{"} copy usrlnk to {"} | copy usrlnk to ">"] to end
                    ]
                    com/userlink: usrlnk
                )
                copy usr string! (com/user: usr)
                to end
            ][
                com/userlink: to-url rejoin["http://www.xanga.com/" com/user]
                return com
            ][output "Error parsing comment"]
        ][output "Error getting comment"]
    ]
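    ;a successfully parsed comment comes back as the object make-comment
    ;builds; roughly (field values invented for illustration):
    ;    user:     ["someone"]                    ;block captured by parse
    ;    userlink: http://www.xanga.com/someone
    ;    text:     [...]                          ;markup between the tags
    ;    date:     18-May-2007/13:30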
    ;this function loops through the page to find all the comments and calls
    ;the get-comment function to get the data; the loop body here is a
    ;reconstruction, since it was also stripped from this copy
    get-comments: func[page [string!] /locals coms][
        coms: copy []
        while[page: find page {<lost-tag>}][
            append coms get-comment page
            page: next page
        ]
        coms
    ]
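    ;the scrapers here all lean on load/markup, which splits raw HTML into an
    ;interleaved block of tag! and string! values that the parse rules walk;
    ;a minimal sketch of the idea (invented markup, not xanga's):
    ;    probe load/markup {<b>hello</b> world}
    ;    == [<b> "hello" </b> " world"]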
    ;this fetches a post page and parses out the post data; the function
    ;header and the fetch are reconstructed, and <lost-tag> again marks
    ;stripped xanga markup literals
    get-post: func[page [url!] /locals page_mrk blog dt tt txt tm][
        if error? try[
            blog: make-post
            page_mrk: read page
            either parse load/markup page_mrk [
                thru <lost-tag>
                any tag! copy dt string! (blog/date: dt)
                [
                    thru <lost-tag>
                    [copy tt string! some tag! (blog/title: tt) | some tag! (blog/title: none)]
                    | thru <lost-tag> some tag! (blog/title: none)
                ]
                copy txt to <lost-tag> (blog/text: txt)
                thru <lost-tag>
                copy tm string! (blog/date: make-date first tm)
                to end
            ][
                blog/comments: get-comments page_mrk
                blog/categories: ["xanga"]
                output rejoin["Retrieved Post From " blog/date]
                append posts blog
            ][output rejoin["Error parsing post: " page]]
        ][output rejoin["Error getting post " page]]
    ]

    ;this loops through all the post urls and calls the get-post function
    get-posts: func[][foreach l links[get-post to-url l] output "Finished getting posts"]
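    ;get-pagelist and get-pages below both recover an href with the same
    ;string parse rule; a standalone sketch of it, on an invented anchor:
    ;    link: none
    ;    parse {a href="http://www.xanga.com/somewhere"} [
    ;        "a" thru "href=" [{"} copy link to {"} | copy link to ">"] to end
    ;    ]
    ;    link
    ;    == "http://www.xanga.com/somewhere"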

    ;this takes a page and pulls out the links to the individual posts' comment pages
    get-pagelist: func[site[url!] /locals site_mrk link links_found][
        if error? try[
            links_found: 0
            site_mrk: load/markup site
            forall site_mrk[
                unless tag? (first site_mrk)[
                    if find (first site_mrk) "comment"[
                        if parse first back site_mrk [
                            "a" thru "href=" [{"} copy link to {"} | copy link to ">"] to end
                        ][
                            append links link
                            links_found: links_found + 1
                        ]
                    ]
                ]
            ]
            output rejoin["Page: " site " Links: " links_found]
        ][output rejoin["Error getting pagelist from " site]]
    ]

    ;this gets a page and passes it to the get-pagelist function, then follows
    ;the "Next 5 >>" link to the next xanga page of posts
    get-pages: func[site[url!] /locals site_mrk link][
        if error? try[
            get-pagelist site
            pages: pages + 1
            site_mrk: load/markup site
            forall site_mrk[
                unless tag? (first site_mrk)[
                    if find (first site_mrk) "Next 5 >>"[
                        if parse first back site_mrk [
                            "a" thru "href=" [{"} copy link to {"} | copy link to ">"] to end
                        ][get-pages to-url rejoin[http://www.xanga.com link]]
                    ]
                ]
            ]
        ][output rejoin["Error getting pages from " site]]
    ]

    ;this saves all the posts in a file in a rebol object format
    save-posts: func[filename[file!]][save/all filename posts output "Data saved"]
    ;save filename posts output "Data saved"]

    load-posts: func[filename[file!]][
        ;posts: reduce load filename
        ;foreach post posts[post/comments: reduce post/comments]
        posts: load/all filename
        output "Data loaded"
    ]

    ;three-letter weekday name, e.g. "Fri"
    get-weekday: func[day[integer!]][copy/part (pick system/locale/days day) 3]

    ;three-letter month name, e.g. "May"
    get-month: func[mon[integer!]][copy/part (pick system/locale/months mon) 3]

    ;zero-pad a day or month number to two digits
    pad-two: func[dt[integer!]][if dt < 10[dt: rejoin["0" dt]] to-string dt]

    ;this exports the stored posts as wordpress (WXR) xml
    ;the xml element names in the templates below were stripped from this copy
    ;of the script; they are reconstructed here from the surviving values and
    ;the WXR layout, so treat the exact tag set as an approximation
    export-wpxml: func[filename[file!] /title ttl[string!] /url ul[url!]][
        wp-xml: copy []
        post-id: 1
        comment-id: 1
        output rejoin["Starting to export data to " filename]
        repend wp-xml [
            {<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
    xmlns:content="http://purl.org/rss/1.0/modules/content/"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:wp="http://wordpress.org/export/1.0/">
<channel>
<title>} either title[ttl][""] {</title>
<link>} either url[ul][""] {</link>
<pubDate>} (get-weekday now/weekday) ", " now/day " " (get-month now/month) " " now/year " " now/time " +0000" {</pubDate>
<language>en</language>
<generator>xanga00</generator>}
        ]
        foreach post posts[
            repend wp-xml [
                {<item>
<title>} either post/title[post/title][post/date/date] {</title>
<link>} either url[ul][""] "/" post/date/year "/" (pad-two post/date/month) "/" (pad-two post/date/day) {/</link>
<pubDate>} get-weekday post/date/weekday ", " post/date/day " " get-month post/date/month " " post/date/year " " post/date/time ":00" { +0000</pubDate>
<dc:creator>} either post/user[post/user][""] {</dc:creator>
<guid>} either url[ul][""] "/" post/date/year "/" (pad-two post/date/month) "/" (pad-two post/date/day) {/</guid>
<content:encoded><![CDATA[} post/text {]]></content:encoded>
<wp:post_id>} post-id {</wp:post_id>
<wp:post_date>} post/date/year "-" post/date/month "-" post/date/day " " post/date/time {:00</wp:post_date>
<wp:post_date_gmt>} post/date/year "-" post/date/month "-" post/date/day " " (post/date/time + post/date/zone) {:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>closed</wp:ping_status>
<wp:post_name>} post-id {</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>} newline
            ]
            foreach com post/comments[
                ;gmt: com/date + com/date/zone
                unless none? com[
                    repend wp-xml [
                        {<wp:comment>
<wp:comment_id>} comment-id {</wp:comment_id>
<wp:comment_author>} com/user {</wp:comment_author>
<wp:comment_author_url>} com/userlink {</wp:comment_author_url>
<wp:comment_date>} com/date/year "-" com/date/month "-" com/date/day " " com/date/time {:00</wp:comment_date>
<wp:comment_date_gmt>} com/date/year "-" com/date/month "-" com/date/day " " (com/date/time + com/date/zone) {:00</wp:comment_date_gmt>
<wp:comment_content>} com/text {</wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<wp:comment_parent>0</wp:comment_parent>
</wp:comment>}
                    ]
                    comment-id: comment-id + 1
                ]
            ]
            append wp-xml rejoin[newline "</item>"]
            post-id: post-id + 1
        ]
        append wp-xml rejoin[newline "</channel>" newline "</rss>"]
        write filename wp-xml
        output "Finished exporting data"
    ]

    ;true if any posts are currently stored
    post-data?: func[][either (length? posts) > 0[return true][return false]]

    ;this clears all the stored site data
    clear-data: func[][
        clear links
        clear posts
        links: copy []
        posts: copy []
        pages: 0
    ]

    ;this scrapes an entire xanga site into the posts block
    export: func[site[url!]][
        links: copy []
        posts: copy []
        pages: 0
        get-pages site
        get-posts
        output rejoin[(length? posts) " posts stored"]
    ]

    links: copy []
    pages: 0
    posts: copy []
]

;typical use
;xanga/export http://www.xanga.com/username
;xanga/save-posts %posts.r
;xanga/export-wpxml/url/title %xanga.xml http://www.whatever.com/ "The Title"
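;the posts block survives between sessions via save-posts/load-posts, so the
;wordpress export can be rerun offline; a sketch with made-up file names:
;xanga/load-posts %posts.r
;if xanga/post-data? [xanga/export-wpxml %xanga.xml]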