Skip to main content

Read an RSS Feed Using StAX for XML Parsing

This entry shows how to read an RSS feed using StAX.

RSS: stands for Rich Site Summary. An RSS document (called a feed or web feed) includes full or summarized text and metadata such as the publishing date and the author's name. An RSS document is an XML file whose structure is defined by the RSS specification.

StAX: stands for Streaming API for XML; it is an API for XML processing.

* Create a Domain model to represent a Feed.

 Feed Example

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>

<rss xmlns:media="http://search.yahoo.com/mrss/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">  
  <channel> 
    <title>BBC News - Asia</title>  
    <link>http://www.bbc.co.uk/news/world/asia/#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link>  
    <description>The latest stories from the Asia section of the BBC News web site.</description>  
    <language>en-gb</language>  
    <lastBuildDate>Sat, 23 Nov 2013 08:47:43 GMT</lastBuildDate>  
    <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/2/hi/help/rss/4498287.stm for terms and conditions of reuse.</copyright>  
    <ttl>15</ttl>  
    <atom:link href="http://feeds.bbci.co.uk/news/world/asia/rss.xml" rel="self" type="application/rss+xml"/>  
    <item> 
      <title>China creates 'air-defence zone'</title>  
      <description>China demarcates an "air-defence identification zone" over the East China Sea, including islands that are also claimed by Japan.</description>  
      <link>http://www.bbc.co.uk/news/world-asia-25062525#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link>  
      <guid isPermaLink="false">http://www.bbc.co.uk/news/world-asia-25062525</guid>  
      <pubDate>Sat, 23 Nov 2013 09:02:26 GMT</pubDate>  
      <media:thumbnail width="66" height="49" url="http://news.bbcimg.co.uk/media/images/71298000/jpg/_71298080_e4739517-4b93-445e-ac3f-e84f805b54aa.jpg"/>  
      <media:thumbnail width="144" height="81" url="http://news.bbcimg.co.uk/media/images/71298000/jpg/_71298081_e4739517-4b93-445e-ac3f-e84f805b54aa.jpg"/> 
    </item>  
    <item> 
      <title>Deadly bomb blasts hit Karachi</title>  
      <description>At least seven people are reported to have been killed in two bomb explosions in a predominantly Shia area of Pakistan's southern city of Karachi.</description>  
      <link>http://www.bbc.co.uk/news/world-asia-25058015#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link>  
      <guid isPermaLink="false">http://www.bbc.co.uk/news/world-asia-25058015</guid>  
      <pubDate>Fri, 22 Nov 2013 22:02:47 GMT</pubDate>  
      <media:thumbnail width="66" height="49" url="http://news.bbcimg.co.uk/media/images/71297000/jpg/_71297424_71296480.jpg"/>  
      <media:thumbnail width="144" height="81" url="http://news.bbcimg.co.uk/media/images/71297000/jpg/_71297425_71296480.jpg"/> 
    </item>  
 </channel> 
</rss>

Feed item class: represents an item

package jbohn.xml.rss.model;

/**
 * One entry of a feed: a mutable bean holding the title, author,
 * description, link and guid strings of a single item.
 *
 * All fields start out as {@code null} until the matching setter is called.
 */
public class FeedItem 
{
 String title;
 String author;
 String description;
 String link;
 String guid;
 
 /** @return the item title, or {@code null} if none was set */
 public String getTitle() {
  return title;
 }
 /** @param title the item title */
 public void setTitle(String title) {
  this.title = title;
 }
 /** @return the item author, or {@code null} if none was set */
 public String getAuthor() {
  return author;
 }
 /** @param author the item author */
 public void setAuthor(String author) {
  this.author = author;
 }
 /** @return the item description, or {@code null} if none was set */
 public String getDescription() {
  return description;
 }
 /** @param description the item description */
 public void setDescription(String description) {
  this.description = description;
 }
 /** @return the item link, or {@code null} if none was set */
 public String getLink() {
  return link;
 }
 /** @param link the item link */
 public void setLink(String link) {
  this.link = link;
 }
 /** @return the item guid, or {@code null} if none was set */
 public String getGuid() {
  return guid;
 }
 /** @param guid the item guid */
 public void setGuid(String guid) {
  this.guid = guid;
 }
 
 /**
  * Renders all fields for debugging. The label is the actual class name
  * ("FeedItem"); it previously read "FeedMessage", which is not a class
  * in this code.
  *
  * @return a single-line description of this item
  */
 @Override
 public String toString()
 {
  return "FeedItem [title=" + title + ", description=" + description
          + ", link=" + link + ", author=" + author + ", guid=" + guid
          + "]";
 }
}


Feed class:

package jbohn.xml.rss.model;

import java.util.ArrayList;
import java.util.List;

/**
 * Channel-level data of a feed plus the list of its items.
 *
 * The six string attributes are fixed at construction time; the item list
 * starts empty and is filled through the list returned by
 * {@link #getMessages()}.
 */
public class Feed {
 final String title;
 final String link;
 final String description;
 final String language;
 final String copyright;
 final String pubDate;
 final List<FeedItem> entries = new ArrayList<>();

 /**
  * Stores the given channel attributes as-is (no validation or copying).
  *
  * @param title       channel title
  * @param link        channel link
  * @param description channel description
  * @param language    channel language
  * @param copyright   channel copyright notice
  * @param pubDate     channel publication date, kept as a plain string
  */
 public Feed(String title, String link, String description, String language,
   String copyright, String pubDate) {
  this.title = title;
  this.link = link;
  this.description = description;
  this.language = language;
  this.copyright = copyright;
  this.pubDate = pubDate;
 }

 /** @return the channel title */
 public String getTitle() {
  return this.title;
 }

 /** @return the channel link */
 public String getLink() {
  return this.link;
 }

 /** @return the channel description */
 public String getDescription() {
  return this.description;
 }

 /** @return the channel language */
 public String getLanguage() {
  return this.language;
 }

 /** @return the channel copyright notice */
 public String getCopyright() {
  return this.copyright;
 }

 /** @return the channel publication date string */
 public String getPubDate() {
  return this.pubDate;
 }

 /**
  * @return the internal item list itself (not a copy), so items added to
  *         the returned list become part of this feed
  */
 public List<FeedItem> getMessages() {
  return this.entries;
 }

 /**
  * Renders the channel attributes for debugging; the items are not included.
  *
  * @return a single-line description of this feed
  */
 @Override
 public String toString() {
  StringBuilder text = new StringBuilder("Feed [copyright=");
  text.append(copyright);
  text.append(", description=").append(description);
  text.append(", language=").append(language);
  text.append(", link=").append(link);
  text.append(", pubDate=").append(pubDate);
  text.append(", title=").append(title);
  text.append("]");
  return text.toString();
 }
} 

RSSFeedParser class

package jbohn.xml.rss.read;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.XMLEvent;

import jbohn.xml.rss.model.Feed;
import jbohn.xml.rss.model.FeedItem;

/**
 * Reads an RSS document from a URL with the StAX event API and turns it into
 * a {@link Feed} holding one {@link FeedItem} per {@code <item>} element.
 *
 * Only un-prefixed elements are interpreted, so extension elements such as
 * {@code atom:link} or {@code media:title} do not overwrite the RSS values
 * that share their local name.
 *
 * NOTE: elements are matched by name only, not by nesting depth, so a
 * {@code <title>} or {@code <link>} nested inside another container element
 * is treated like a direct child of the channel or item.
 */
public class RSSFeedParser 
{
 static final String TITLE = "title";
 static final String DESCRIPTION = "description";
 static final String CHANNEL = "channel";
 static final String LANGUAGE = "language";
 static final String COPYRIGHT = "copyright";
 static final String LINK = "link";
 static final String AUTHOR = "author";
 static final String ITEM = "item";
 static final String PUB_DATE = "pubDate";
 static final String GUID = "guid";
 
 /** Location of the feed document; fixed at construction time. */
 final URL url;
 
 /**
  * @param feedURL textual URL of the feed
  * @throws RuntimeException wrapping a {@link MalformedURLException} when
  *         {@code feedURL} cannot be parsed as a URL
  */
 public RSSFeedParser(String feedURL)
 {
  try
  {
   this.url = new URL(feedURL);
  }
  catch(MalformedURLException e)
  {
   throw new RuntimeException(e);
  }
 }
 
 /**
  * Opens the feed URL.
  *
  * @return a stream over the feed document; the caller must close it
  * @throws RuntimeException wrapping the {@link IOException} when the
  *         stream cannot be opened
  */
 private InputStream read()
 {
  try{
   return url.openStream();
  }
  catch(IOException e)
  {
   throw new RuntimeException(e);
  }
 }
 
 /**
  * Collects the text that directly follows the start element just read.
  *
  * Every consecutive character event is appended, because a parser may
  * deliver one text node in several pieces. The reader is only peeked
  * before consuming, so a following non-text event (an end tag or a nested
  * start tag) stays in the stream for the main loop.
  *
  * @param event       the start element that was just read (not used;
  *                    kept so the signature stays unchanged)
  * @param eventReader reader positioned right after that start element
  * @return the concatenated text, or the empty string if there is none
  * @throws XMLStreamException if the underlying parser fails
  */
 private String getCharacterData(XMLEvent event, XMLEventReader eventReader)
   throws XMLStreamException {
  StringBuilder text = new StringBuilder();
  // peek() yields null at end of input; instanceof is false for null.
  while ( eventReader.peek() instanceof Characters )
  {
   text.append(eventReader.nextEvent().asCharacters().getData());
  }
  return text.toString();
 }
 
 /**
  * Downloads and parses the feed.
  *
  * Values read before the first {@code <item>} become the channel
  * attributes of the returned feed. Each item starts with empty fields, so
  * an item that omits an element does not inherit the value from the
  * channel or from the previous item.
  *
  * @return the parsed feed; never {@code null}. A document without any
  *         {@code <item>} yields a feed with an empty item list.
  * @throws RuntimeException wrapping an {@link XMLStreamException} or
  *         {@link IOException} when reading or parsing fails
  */
 public Feed readFeed()
 {
  Feed feed = null;
  // Header values start out as the empty string.
  String description = "";
  String title = "";
  String link = "";
  String language = "";
  String copyright = "";
  String author = "";
  String pubdate = "";
  String guid = "";
  
  XMLInputFactory inputFactory = XMLInputFactory.newInstance();
  // The document comes from a remote host: turn off DTD processing and
  // external entity resolution to guard against XXE.
  inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
  inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
  
  try (InputStream in = read())
  {
   XMLEventReader eventReader = inputFactory.createXMLEventReader(in);
   try
   {
    while(eventReader.hasNext())
    {
     XMLEvent event = eventReader.nextEvent();
     if (event.isStartElement())
     {
      // Skip prefixed (extension namespace) elements such as atom:link.
      if (!event.asStartElement().getName().getPrefix().isEmpty())
      {
       continue;
      }
      String localPart = event.asStartElement().getName().getLocalPart();
      switch (localPart) {
      case ITEM:
       if (feed == null)
       {
        // First item: everything read so far belongs to the channel.
        feed = new Feed(title, link, description, language, copyright, pubdate);
       }
       // Start every item from a clean slate.
       title = "";
       description = "";
       link = "";
       guid = "";
       author = "";
       break;
      case TITLE:
       title = getCharacterData(event, eventReader);
       break;
      case DESCRIPTION:
       description = getCharacterData(event, eventReader);
       break;
      case LINK:
       link = getCharacterData(event, eventReader);
       break;
      case GUID:
       guid = getCharacterData(event, eventReader);
       break;
      case LANGUAGE:
       language = getCharacterData(event, eventReader);
       break;
      case AUTHOR:
       author = getCharacterData(event, eventReader);
       break;
      case PUB_DATE:
       pubdate = getCharacterData(event, eventReader);
       break;
      case COPYRIGHT:
       copyright = getCharacterData(event, eventReader);
       break;
      default:
       break;
      }
     }
     else if (event.isEndElement())
     {
      boolean unprefixed = event.asEndElement().getName().getPrefix().isEmpty();
      if (unprefixed && ITEM.equals(event.asEndElement().getName().getLocalPart()))
      {
       // The matching <item> start tag has already created the feed.
       FeedItem message = new FeedItem();
       message.setAuthor(author);
       message.setDescription(description);
       message.setGuid(guid);
       message.setLink(link);
       message.setTitle(title);
       feed.getMessages().add(message);
      }
     }
    }
   }
   finally
   {
    eventReader.close();
   }
  }
  catch(XMLStreamException | IOException e)
  {
   throw new RuntimeException(e);
  }
  
  if (feed == null)
  {
   // No <item> was seen: still hand back the channel data.
   feed = new Feed(title, link, description, language, copyright, pubdate);
  }
  return feed;
 }
}


Running:
package jbohn.xml.rss.main;

import jbohn.xml.rss.model.Feed;
import jbohn.xml.rss.model.FeedItem;
import jbohn.xml.rss.read.RSSFeedParser;

/**
 * Command-line demo: reads the BBC Asia news feed and prints the feed
 * followed by each of its items to standard output.
 */
public class RSSMain {
 /**
  * @param args ignored
  */
 public static void main(String[] args) {
  String feedUrl = "http://feeds.bbci.co.uk/news/world/asia/rss.xml";
  RSSFeedParser parser = new RSSFeedParser(feedUrl);
  Feed feed = parser.readFeed();
  if (feed == null) {
   // readFeed() may hand back null; report it instead of failing with
   // a NullPointerException on feed.getMessages() below.
   System.err.println("No feed could be read from " + feedUrl);
   return;
  }
  System.out.println(feed);
  for (FeedItem message : feed.getMessages()) {
   System.out.println(message);
  }
 }
}



Summarized by jbohn.
Reference: http://www.vogella.com/articles/RSSFeed/article.html#rssoverview

Comments

  1. Does this make the difference between rss and atom?

    To me it looks like it would read
    [link]http://www.bbc.co.uk/news/world/asia/#sa-ns_mchannel=rss&ns_source=PublicRSS20-sa[/link]

    and then this link would be overridden by
    [atom:link href="http://feeds.bbci.co.uk/news/world/asia/rss.xml" rel="self" type="application/rss+xml"/]

    How do you handle that?

    ReplyDelete

Post a Comment

Popular posts from this blog

How to Install SQL Server on MacOS with docker

 I'm writing a small tut for who need to install SQL Server on macOS using docker Step 1: Download the SQL Server Image sudo docker pull mcr.microsoft.com/mssql/server:2019-latest Step 2: Launch the SQL Server Image in Docker docker run -d --name example_sql_server -e 'ACCEPT_EULA=Y' -e 'SA_PASSWORD=Pass.word-123' -p 1433:1433 mcr.microsoft.com/mssql/server:2019-latest Step 3: Check the SQL Server Docker Container docker ps -a Step 4: Install SQL Server Command-Line Tool sudo npm install -g sql-cli Step 5: Connect to SQL Server  5.1 Using Command mssql -u sa -p Pass.word-123 5.2: Using VSCode to connect to sql server Using the extension SQL Server (mssql)

Fast English Word Learning with Flashcard Generator

Introducing a tool that generates flashcards for preschoolers learning English. With just the words input, this tool creates visually appealing flashcards with buttons to hear the word and search related images using Bing. It's the perfect way to accelerate language learning for young children. Benefits: - Expand vocabulary quickly - Engage multiple senses for effective learning - Interactive and fun experience Try it: Flashcard Generator