So, I have already complained about Google’s Spreadsheets API. After crawling the news, I felt I should do some text mining to break the sentences down and teach my algo to read the news. Then I came across Google’s NLP API. Ha! I knew I wouldn’t like it, but why not?

1. Let’s create a google_nlp class

# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

class nlp:
    def __init__(self, text):
        self.text = text

    def getEntity(self):
        try:
            # Create the client and wrap the text in a Document
            client = language.LanguageServiceClient()
            document = types.Document(
                content=self.text,
                type=enums.Document.Type.PLAIN_TEXT
            )
            encoding_type = enums.EncodingType.UTF8

            response = client.analyze_entities(document, encoding_type=encoding_type)

            # Entities come back ordered by salience, so returning inside
            # the loop hands back the most prominent [name, type] pair;
            # if no entities were found, we fall through and return None
            for entity in response.entities:
                return [entity.name, enums.Entity.Type(entity.type).name]

        except Exception as e:
            print(e)

The analyze_entities call is the heart of this module: it creates a request to the API and hands back the entities it found, each with a representative name and a type. There are more features you can play with; the most useful one is tokenization, but I will tell you about that later 😉.
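To give a taste of those “more features”: each entity in the response also carries a salience score, metadata, and its mentions. A minimal sketch, assuming the client and response from the class above (field names are from the same pre-2.0 client library):

# Assuming response came from client.analyze_entities(...) as above
for entity in response.entities:
    print(entity.name)
    # salience: a 0..1 score of how central the entity is to the text
    print("salience: {:.3f}".format(entity.salience))
    # metadata may include a Wikipedia URL or a Knowledge Graph MID
    for key, value in entity.metadata.items():
        print("{}: {}".format(key, value))
    # every place the entity is mentioned, with the mention type (PROPER/COMMON)
    for mention in entity.mentions:
        print(u"mention: {} ({})".format(
            mention.text.content,
            enums.EntityMention.Type(mention.type).name))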

2. Let’s call the class while crawling the news

import feedparser
import os
import time
import mr_know_it_all.rss_feedparser
import mr_know_it_all.parse_html
import mr_know_it_all.google_nlp

def getstats(url):
    try:
        rss = mr_know_it_all.rss_feedparser.rss(url)
        result = rss.fetchDetails()

        return result
    except Exception as e:
        print(e)

def printEntities(result):
    # Parse each article's HTML and run every line through the NLP class,
    # printing the most salient entity found in that line
    for r in result:
        print(r[0])
        html = mr_know_it_all.parse_html.html_class(r[1])
        text = html.parsedHTML()
        print(text)

        for line in text:
            entity = mr_know_it_all.google_nlp.nlp(line)
            pair = entity.getEntity()
            if pair:
                print(u"{0}:{1}".format(pair[0], pair[1]))

def main():
    try:
        url = 'http://feeds.reuters.com/reuters/UKTopNews'
        # url = 'https://feeds.a.dj.com/rss/RSSWorldNews.xml'

        result = getstats(url)
        title_init = result[0][0]
        latest_title = result[0][0]

        print("title_init:", title_init)
        printEntities(result)

        i = 0
        while latest_title == title_init:
            i += 1
            print(i, "no new feeds yet, sleep 60 seconds")
            time.sleep(60)
            result = getstats(url)
            latest_title = result[0][0]

            if latest_title != title_init:
                print("new news:")
                i = 0
                # Reset the baseline so the loop keeps polling for the next update
                title_init = latest_title
                printEntities(result)

    except Exception as e:
        print(e)

if __name__ == '__main__':
    main()
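One version note: the enums and types imports above come from the pre-2.0 google-cloud-language client; release 2.0 removed those modules, so if the imports fail on your machine, pin the older client, e.g. pip install "google-cloud-language<2.0".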

P.S. You will need to handle authentication with a Google service account.
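The standard way (assuming you have already created a service account and downloaded its JSON key) is to point the GOOGLE_APPLICATION_CREDENTIALS environment variable at the key file before the client is created; the path below is just a placeholder:

import os

# Placeholder path: swap in the location of your own service-account key
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/path/to/service-account-key.json'

# The client library picks the credentials up from the environment automatically
from google.cloud import language
client = language.LanguageServiceClient()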

Nah, just joking, I won’t leave you hanging. I will tell you in a follow-up post how to use grammar with the tokenization, where we can find more interesting insights in all that news and text, and teach your computer to read the news exactly the way you programmed it 😎😎😎. Please feel free to 👉📱 message my Twilio bot (+447479275693).

Buy me a coffee