parse reserve bank blog

In [1]:
import requests
import xmltodict
import bs4
from bs4 import BeautifulSoup
import getpass
import nltk
In [2]:
myusr = getpass.getuser()
In [3]:
myreq = requests.get('http://www.rbnz.govt.nz/feeds/news')
In [4]:
xlpars = xmltodict.parse(myreq.text)
In [5]:
xlitem = xlpars['rss']['channel']['item']
In [6]:
blogtxt = list()
In [8]:
for xli in xlitem:
    tit = (xli['title'])
    titslug = tit.replace(' ', '-')
    myxl = requests.get(xli['link'])
    #myso = bs4.BeautifulSoup(myxl)
    putime = (xli['pubDate'])
    with open('/home/{}/artctrl/posts/{}.meta'.format(myusr, titslug[0:15]), 'w') as rbn:
        rbn.write('..title: {}\n.. slug: {}\n.. date: {}\n'.format(titslug, titslug, putime))
#    .. title: wer
#.. slug: wer
#.. date: 2017-07-30 00:56:50 UTC+12:00
#.. tags: 
#.. link: 
#.. description: 
#.. type: text
    #with open(newfile, 'w') as outfile
    soup = BeautifulSoup(myxl.text, 'html.parser')
    finpo = soup.find_all('p')
    finp = finpo[:2]
    with open('/home/{}/artctrl/posts/{}.rst'.format(myusr, titslug[0:15]), 'w') as rbn:
        rbn.write('{}\n\n{}\n\n{}'.format(titslug, xli['description'], finp))
    for finp in finpo[:2]:
        #with open('/home/{}/rbnz/posts/{}.rst'.format(myusr, titslug[0:15]), 'a') as rbn:
        #    rbn.write('{\n\n{}\n\n'.format(str(finp.text)))
    #refError = soup.findAll('span', { 'class': 'mw-ext-cite-error'})

The New Zealand Defence Force has signed a lease to occupy three floors in the Reserve Bank of New Zealand building, beginning later this year.
Reserve Bank Head of Currency, Property and Security, Steve Gordon, said that the Defence Force will be the fourth tenant in the building, joining the Parliamentary Counsel Office; Parliamentary Commissioner for the Environment; and the State Services Commission. The Reserve Bank has vacated the floors that are being leased to the Defence Force, as part of a strategy to increase its property income to meet its Funding Agreement. 
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-8-a681473fa36f> in <module>()
     13     with open('/home/{}/artctrl/posts/{}.meta'.format(myusr, titslug[0:15]), 'w') as rbn:
     14         rbn.write('..title: {}\n.. slug: {}\n.. date: {}\n'.format(titslug, titslug, putime))
---> 15         rbn.write(xli['description'])
     17 #    .. title: wer

UnicodeEncodeError: 'ascii' codec can't encode character '\u2019' in position 16: ordinal not in range(128)
In [ ]:
mybltx = ' '.join(blogtxt)
In [ ]:
nltool = nltk.word_tokenize(mybltx)
In [ ]:
tagged = nltk.pos_tag(nltool)
In [1]:
word_tag_pairs = nltk.bigrams(nbrown_news_tagged)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]
NameError                                 Traceback (most recent call last)
<ipython-input-1-55f5a166a831> in <module>()
----> 1 word_tag_pairs = nltk.bigrams(nbrown_news_tagged)
      2 noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
      3 fdist = nltk.FreqDist(noun_preceders)
      4 [tag for (tag, _) in fdist.most_common()]

NameError: name 'nltk' is not defined


Comments powered by Disqus