Oct 12, 2014

Generate an RSS Feed from any Webpage

A friend of mine wrote for Softonic.com on a whole range of web and software-related topics, and I wanted to follow his writing, preferably with an RSS reader. The site didn't offer a feed filtered by author, so with the help of ScrapeNFeed I had an excuse to use a bit of Python.

There is also a fine online service for generating feeds, which I only became aware of later: Feed43.

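The script below subclasses ScrapeNFeed's ScrapedFeed: it fetches the first two pages of the author listing, scrapes headline, date, tags and full article text for every entry, and writes the result to softonic.xml, with softonic.pickle keeping track of which links have already been seen.
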
#!/usr/bin/env python
#
# can do: heading, text preview, full article text, author name, publishing date, category tags, user-agent spoofing
# can't do: csv writeout, rate-limiting in ms/s, num_comments as integer
#
# download the release from https://www.crummy.com/software/ScrapeNFeed/ into lib/

from bs4 import BeautifulSoup
from urllib2 import urlopen, Request
from PyRSS2Gen import RSSItem, Guid, Category
import lib.ScrapeNFeed as ScrapeNFeed
import datetime, re

rooturl = 'http://news.softonic.de/editor/john-doe/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'}

# fetch the first two pages of the author listing and parse them as one document
soup_html = ''
for number in range(1, 3):
    url = rooturl + str(number)
    req = Request(url, None, headers)          # spoof the user agent
    soup_html = soup_html + urlopen(req).read()

soup = BeautifulSoup(soup_html, "html.parser")


class SoftonicFeed(ScrapeNFeed.ScrapedFeed):

    def HTML2RSS(self, headers, body):
        # every article teaser card on the listing pages
        newsitems = soup.find_all("article", class_="card-list")

        items = []
        for item in newsitems:
            link = item.h3.a['href']
            if self.hasSeen(link):
                continue
            headline = item.h3.string

            # author name and publishing date live in the same parent element
            author = item.find("a", rel="author").string
            authordate = item.find("a", rel="author").parent.text.strip("\t\r\n")
            datematch = re.search(r'(\d+)/(\d+)/(\d+)', authordate)
            date = datetime.datetime.strptime(datematch.group(), "%d/%m/%y")

            # category tags of the article
            category = item.find("ul", class_="tag-default-news")
            tags = []
            for tag in category.find_all("li"):
                tags.append(tag.string)
            alltags = ', '.join(tags)

            # comment count, taken from the footer link's title attribute
            comments = item.footer.a['title']
            num = [int(s) for s in comments.split() if s.isdigit()]
            num_comments = int(num[0])

            # fetch the article itself for the full text
            #req = Request(link, None, headers)
            article_content = urlopen(link).read()
            content = BeautifulSoup(article_content, "html.parser")
            newsbody_html = content.find("div", class_="post-body")

            items.append(RSSItem(title=headline,
                                 description=unicode(newsbody_html),
                                 link=link,
                                 pubDate=date,
                                 categories=tags))
        self.addRSSItems(items)


SoftonicFeed.load("Softonic News - John Doe",
                  rooturl,
                  "keep track of John's articles as they're posted",
                  'softonic.xml',
                  'softonic.pickle',
                  managingEditor='John Doe',
                  webMaster='jify')
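
Not part of the original script, but a quick way to check the result is to parse the generated softonic.xml with Python's standard library and list what ended up in the feed (the file name is the one passed to SoftonicFeed.load above):

#!/usr/bin/env python
# sanity check: print title and link of every <item> in the generated feed
import xml.etree.ElementTree as ET

tree = ET.parse('softonic.xml')
for item in tree.getroot().iter('item'):
    print(item.findtext('title') + ' - ' + item.findtext('link'))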