Javascript Feed Parser

2010-03-22

Here's a simple XML feed parser in Javascript. It uses the browsers' native support for XML parsing (remember the X in Ajax?). This script is easy to set up and can be modified to your needs very quickly.

With the emerging cross domain data messages (or by cheating with YQL...) and the native XML parsing capabilities of the browser, there's little more to parsing feeds. Easy as pie!

This script will try to parse the Feed and return all the elements it can find as Item objects. These contain the title, description and link of the item (found in all types of feed supported). Additionally, the XML subtree of the item is also saved.

The script also supports plugins, which can be attached to a specific feed or to a group of feeds. A plugin is registered beforehand (like a callback) and is executed whenever a feed with the specified name or group is parsed. The plugin has access to the XML subtree of every item and is able to extract alternative data from the feed.

The idea is to normalize the data into these Item objects and handle them generically by some other script. That way you don't have to worry about what type of feed you're going to parse. The results are always the same.

The script currently supports RSS 0.91, 1.0 and 2.0, and Atom. Other versions should be easily hackable; just take one of the existing functions as an example and build from there :)

The plugin callbacks get called with one parameter: an array of Item objects. Each Item is a simple (extensible) object with five properties: title, description, link, date and xml. You can manipulate the array in place, including deleting or adding items; whatever state the plugins leave the array in is what Feed.parse will (eventually) return.

You start the parser by calling Feed.parse(xml, name, group), where xml is the already parsed XML tree of the feed. On success it returns an array containing the Item objects, one Item for each entry in the feed. This array will have been processed by the plugins, when applicable.

There isn't much else to say. It's a simple straightforward feed parsing script :) It's about 8.5k raw and about 3.5k minified. Hope it helps you :)

Code: (JS)
/**
* Feed parser
* Parses raw XML feeds and converts them to so called Item objects (see below).
* By qFox, 2010, http://qfox.nl
*/

var Feed = {
    /**
     * Detect the type of the feed and let type specific functions
     * parse the feed. The result is an array containing Item
     * objects representing the items from the feed.
     * @param XML xml The actual feed, as an XML tree
     * @param string name Name of the feed, passed on to the plugin lookup
     * @param string group Name of group of the feed, passed on to the plugin lookup
     * @return mixed Array with Item objects, or false when the feed type is not supported
     */
    parse: function(xml, name, group){
        var root, version;

        // a failed download/parse usually hands us null instead of a document
        if (!xml) {
            Feed.log("unsupported feed");
            return false;
        }

        // rss 1.0 ("rdf"); depending on namespace handling the root is reported with or without prefix
        if (xml.getElementsByTagName('rdf:RDF').length || xml.getElementsByTagName('RDF').length) {
            return Feed.parseRss1(xml, name, group);
        }

        // rss (2.0 and 0.91), told apart by the version attribute of the root
        root = xml.getElementsByTagName('rss');
        if (root && root.length) {
            version = root[0].getAttribute('version');
            if (version === '2.0') {
                return Feed.parseRss2(root[0], name, group);
            }
            if (version === '0.91') {
                return Feed.parseRss091(root[0], name, group);
            }
            // fall through: the atom check below still gets its chance
            Feed.log(" unknown rss version...");
        }

        // atom
        if (xml.getElementsByTagName('feed').length) {
            return Feed.parseAtom(xml, name, group);
        }

        Feed.log("unsupported feed");
        return false;
    },

    /**
     * Internal: forward a message to the global debug() function, if the
     * page defines one. Without this guard a missing debug() would turn
     * every diagnostic into a ReferenceError.
     * @param string msg The message to report
     */
    log: function(msg){
        if (typeof debug === 'function') debug(msg);
    },

    /**
     * Retrieve the value of the first text child of the first node with the
     * given name, or an empty string on failure.
     * When the third parameter is given, the value of that attribute of the
     * node is returned instead (again an empty string when it is missing).
     * @param xml root The root node to search through
     * @param string name The node name we're looking for
     * @param string atr=false If given, the attribute of node we want returned
     * @return string
     */
    getNodeValue: function(root, name, atr){
        var node, value;
        try {
            node = root.getElementsByTagName(name)[0];
            value = atr ? node.getAttribute(atr) : node.childNodes[0].nodeValue;
        } catch(er) {
            // no such node, or the node is empty: that simply means "no value"
            return '';
        }
        // a missing attribute / non-text first child yields null, normalize that too
        return (value === null || value === undefined) ? '' : value;
    },

    /**
     * Internal: return the first non-empty node value for a list of
     * candidate node names, or an empty string when none of them has one.
     * @param xml root The root node to search through
     * @param array names Node names, in order of preference
     * @return string
     */
    getFirstNodeValue: function(root, names){
        var i, value;
        for (i = 0; i < names.length; ++i) {
            value = Feed.getNodeValue(root, names[i]);
            if (value) return value;
        }
        return '';
    },

    /**
     * Parse a RSS 1.0 feed
     * Returns an array with Item objects.
     *
     * @param document xmlRoot
     * @param string name Name of the feed we're fetching, used to select plugins
     * @param string group Name of the group this feed belongs to, used to select plugins
     * @return array
     */
    parseRss1: function(xmlRoot, name, group){
        var result = [],
            items = xmlRoot.getElementsByTagName('item'),
            item,
            i;

        for (i = 0; i < items.length; ++i) {
            item = items[i];
            // title, link, description, date (dc:date is preferred for rss 1.0)
            try {
                result[result.length] = new Item(
                    Feed.getNodeValue(item, 'title'),
                    Feed.getNodeValue(item, 'description'),
                    Feed.getNodeValue(item, 'link'),
                    Feed.getFirstNodeValue(item, ['dc:date', 'pubDate', 'date']),
                    item
                );
            } catch (er) {
                Feed.log("Unable to parse item "+i+": "+er.message);
            }
        }

        // apply plugins
        Feed.plug(result, name, group);

        // return the items
        return result;
    },

    /**
     * Parse an RSS 2.0 feed
     * Returns an array containing Item objects.
     *
     * @param document xmlRoot
     * @param string name Name of the feed we're fetching, used to select plugins
     * @param string group Name of the group this feed belongs to, used to select plugins
     * @return array
     */
    parseRss2: function(xmlRoot, name, group){
        var result = [],
            items = xmlRoot.getElementsByTagName('item'), // collection of nodes
            item, // one
            i;

        for (i = 0; i < items.length; ++i) {
            item = items[i];
            // now add the item (pubDate is preferred for rss 2.0)
            try {
                result[result.length] = new Item(
                    Feed.getNodeValue(item, 'title'),
                    Feed.getNodeValue(item, 'description'),
                    Feed.getNodeValue(item, 'link'),
                    Feed.getFirstNodeValue(item, ['pubDate', 'dc:date', 'date']),
                    item
                );
            } catch(er) {
                Feed.log("Feed.parseRss2 fail for "+i+" ("+er.message+")");
            }
        }

        // apply plugins
        Feed.plug(result, name, group);

        // return the items
        return result;
    },

    /**
     * Parse a RSS 0.91 feed
     * Returns an array with Item objects
     *
     * @param document xmlRoot
     * @param string name Name of the feed we're fetching, used to select plugins
     * @param string group Name of the group this feed belongs to, used to select plugins
     * @return array
     */
    parseRss091: function(xmlRoot, name, group){
        var result = [],
            items = xmlRoot.getElementsByTagName('item'), // get items for this feed
            item, // single element
            i;

        for (i = 0; i < items.length; ++i) {
            item = items[i];
            // now add the item
            try {
                result[result.length] = new Item(
                    Feed.getNodeValue(item, 'title'),
                    Feed.getNodeValue(item, 'description'),
                    Feed.getNodeValue(item, 'link'),
                    Feed.getFirstNodeValue(item, ['pubDate', 'dc:date', 'date']),
                    item
                );
            } catch(er) {
                Feed.log("Feed.parseRss091 fail for "+i+" ("+er.message+")");
            }
        }

        // apply plugins
        Feed.plug(result, name, group);

        // return the items
        return result;
    },

    /**
     * Parse an Atom feed
     * Returns an array with Item objects.
     *
     * @param document xmlRoot
     * @param string name Name of the feed we're fetching, used to select plugins
     * @param string group Name of the group this feed belongs to, used to select plugins
     * @return array
     */
    parseAtom: function(xmlRoot, name, group){
        var result = [],
            items = xmlRoot.getElementsByTagName('entry'),
            item, // one entry
            i;

        for (i = 0; i < items.length; ++i) {
            item = items[i];
            // title, summary, link (taken from the href attribute), published/updated
            try {
                result[result.length] = new Item(
                    Feed.getNodeValue(item, 'title'),
                    Feed.getNodeValue(item, 'summary'),
                    Feed.getNodeValue(item, 'link', 'href'),
                    Feed.getFirstNodeValue(item, ['published', 'updated']),
                    item
                );
            } catch (er) {
                Feed.log("Unable to parse item "+i+": "+er.message);
            }
        }

        // apply plugins
        Feed.plug(result, name, group);

        // return the items
        return result;
    },

    /**
     * Add a plugin for a specific name or group.
     * The plugin only applies to those names or groups.
     * This is like an addListener or attachListener function.
     * Names and groups are stored in the same lookup object
     * (Feed.objPlugins), so a name and a group that are spelled the same
     * share one plugin list. When both name and group are given, the
     * callback is registered under each of them.
     * @param string name=false Name of feed
     * @param string group=false Group name that feed belongs to
     * @param function callback Plugin, called with callback(arr)
     */
    plugin: function(name, group, callback){
        var p = Feed.objPlugins || (Feed.objPlugins = {});
        if (group) {
            if (!p[group]) p[group] = [];
            p[group][p[group].length] = callback;
        }
        if (name) {
            if (!p[name]) p[name] = [];
            p[name][p[name].length] = callback;
        }
    },

    /**
     * Actually apply plugins by feed name or group. Supply at least one.
     * Plugins registered for the name run first, then those for the group.
     * @param array arr An array with Item objects. Plugins should manipulate this array by reference as nothing is returned.
     * @param string name=false Only apply to feeds by this name
     * @param string group=false Only apply to feeds in this group name
     */
    plug: function(arr, name, group){
        var p = Feed.objPlugins,
            keys = [name, group],
            list,
            k,
            i;

        // if no plugins are registered, just return
        if (!p) return;

        for (k = 0; k < keys.length; ++k) {
            list = keys[k] ? p[keys[k]] : null;
            if (!list || !list.length) continue;
            // run all plugins registered under this name/group
            for (i = 0; i < list.length; ++i) {
                list[i](arr);
            }
        }
    }

};

/**
 * Normalized representation of a single feed entry.
 * @param string title Title of the entry
 * @param string description Description of the entry (not sanitized)
 * @param string url Link of the entry, stored as the `link` property
 * @param string date Timestamp, exactly as it was found in the feed
 * @param xml xml The xml subtree this entry was built from; plugins can get additional non-standard stuff from it
 */
var Item = function(title, description, url, date, xml){
    var self = this;

    self.title = title;
    self.description = description;
    self.link = url;
    self.date = date;
    self.xml = xml;
};