education-counts-url

In [60]:
import bs4
import requests
import dominate
from dominate.tags import *
import os
import shutil
import json
from urlparse import urlparse
from bs4 import BeautifulSoup
In [61]:
#eceall = requests.get('')
In [62]:
# Open the index.html stored under the www.educationcounts.govt.nz directory
# on the removable drive, read-only in text mode. The next cell reads from
# this handle.
opind = open(
    '/media/removable/WILLIAMS30G/www.educationcounts.govt.nz/index.html',
    'r',
)
In [63]:
# Read the whole page into memory, then release the file handle. No visible
# later cell reads from `opind`, so leaving it open only leaks a descriptor.
# try/finally guarantees the close even if the read itself fails.
try:
    optfz = opind.read()
finally:
    opind.close()
In [64]:
# Parse the page text into a BeautifulSoup tree.
# The parser is named explicitly: without the second argument bs4 chooses
# whichever parser happens to be installed (and emits a warning), so the
# resulting tree can differ from one machine to the next. 'html.parser' is
# the one bundled with Python itself.
edcountz = bs4.BeautifulSoup(optfz, 'html.parser')
In [65]:
# Collect every <a> element that actually carries an href attribute.
# The cells below index afi.attrs['href'] directly, which raises KeyError for
# anchors that have no href, so those are filtered out here with href=True.
# find_all is the current spelling of the legacy findAll alias.
afind = edcountz.find_all('a', href=True)
In [66]:
# Empty mapping that the next cell fills with slug -> href entries.
gidict = {}
In [67]:
# Fill gidict with one entry per anchor: short slug of the link text -> href.
for afi in afind:
    # Anchors without an href have no target to record; indexing
    # attrs['href'] directly would raise KeyError for them, so skip them.
    href = afi.attrs.get('href')
    if href is None:
        continue

    gied = afi.text
    gifz = gied.replace(' ', '-')
    giflow = gifz.lower()
    # The key is the first 12 characters of the lower-cased, hyphenated link
    # text, so links whose text shares that prefix overwrite one another
    # (the last one in document order wins).
    gidict[giflow[:12]] = href

    # NOTE(review): the parsed URL is not used by any visible cell; it is
    # kept only because a cell outside this view may read `gihrd`.
    gihrd = urlparse(href)
    
    
In [68]:
# Build an HTML page that lists every absolute https link found in `afind`.
doc = dominate.document(title='educount')

# Stylesheet and script referenced from <head> by relative path.
with doc.head:
    link(rel='stylesheet', href='style.css')
    script(type='text/javascript', src='script.js')

with doc:
    with div(cls='row'):
        h1('education-counts-url')
        for afi in afind:
            # .get() tolerates anchors that have no href attribute; indexing
            # attrs['href'] directly raised KeyError for them.
            href = afi.attrs.get('href', '')
            # startswith() keeps only URLs whose scheme is https. The previous
            # substring test also accepted any href that merely contained
            # 'https://' somewhere later in the string (e.g. a redirect query).
            if href.startswith('https://'):
                gied = afi.text
                gifz = gied.replace(' ', '-')
                giflow = gifz.lower()
                # Link label is the first 12 characters of the slugged text.
                # NOTE(review): an <li> nested inside an <a> directly under a
                # <div> is not valid list markup; the structure is left as-is
                # because style.css / script.js may depend on it -- confirm.
                a(dominate.tags.li(giflow[:12]), href=href)
In [69]:
# Render the dominate document to an HTML string and save it to disk.
docre = doc.render()
# Strip every non-ASCII character; the result is a plain-ASCII text string.
yourstring = docre.encode('ascii', 'ignore').decode('ascii')
# Write the page to index.html. It used to be written to index.json, the very
# file a later cell overwrites with the JSON link index, so the rendered page
# was discarded as soon as that cell ran.
indfil = '/media/removable/WILLIAMS30G/education-counts-url/index.html'
# The context manager closes the file even if the write raises.
with open(indfil, 'w') as mkind:
    mkind.write(yourstring)
In [70]:
# Serialise the slug -> href mapping to a JSON string using the default
# encoder settings.
dicon = json.dumps(obj=gidict)
In [71]:
# Save the JSON link index to disk, replacing any existing file at that path.
# The context manager closes the file even if the write raises, which the
# separate open()/close() calls did not guarantee.
with open('/media/removable/WILLIAMS30G/education-counts-url/index.json', 'w') as opeducon:
    opeducon.write(dicon)
In [72]:
# Read the saved JSON index back as text.
# The context manager closes the handle once the read is done; it was
# previously left open for the life of the kernel.
with open('/media/removable/WILLIAMS30G/education-counts-url/index.json', 'r') as rdeducon:
    rdqa = rdeducon.read()
In [73]:
# Decode the JSON text read from disk back into a Python object.
loadic = json.loads(s=rdqa)

Comments

Comments powered by Disqus