Sitemaps 协议使您能够告知搜索引擎您网站中可供抓取的网址。就其最简单的形式而言,使用 Sitemaps 协议的 Sitemap 就是一个列有某个网站所有网址的 XML 文件。此协议可高度扩展,因此可适用于各种大小的网站。它还能够使网站管理员提供有关每个网址的其他信息(上次更新的时间、更改的频率、与网站中其他网址相比它的重要性),以便搜索引擎可以更智能地抓取该网站。
Sitemaps 在用户无法通过可浏览界面访问网站的所有区域时作用尤其明显。(通常,指用户无法通过追踪链接访问网站的特定页面或区域。)例如,那些只能通过搜索表单才能访问其中某些页面的网站都会从创建 Sitemaps 并将其提交到搜索引擎中获益。
我的这个 BT 网站(http://www.bt285.cn)的 sitemap 就是用 Java 生成的,主要是读取数据库中的数据,并与 crawl 相结合。
地址是http://www.bt285.cn/sitemaps.xml
附java代码:
batchSecondSiteMapCreate 方法:解析与生成第二层的 sitemap.xml。因为 sitemap 有数量与大小的限制,所以像我这个 BT 网站内容有 20 多万条,肯定要分成两层结构。
createFirstSiteMap 方法:解析与生成第一层的 sitemap.xml。
/**
 * Generates a two-level sitemap: a first-level index file ({@code sitemapindex})
 * that lists second-level files ({@code urlset}), each of which lists content URLs.
 *
 * <p>{@link #execute(List)} is the entry point: it loads the existing index,
 * writes a new second-level file for the supplied news items, and rewrites the
 * index so that it also references the new file.
 *
 * <p>I/O and parse failures are logged and otherwise ignored (best effort), so
 * a failed run does not throw to the caller.
 */
public class CreateSiteMap {
    private static final Log logger = LogFactory.getLog(CreateSiteMap.class);

    /** Namespace URI declared on the {@code urlset} root of every second-level sitemap. */
    private static final String SITEMAP_NAMESPACE_URI = "http://www.google.com/schemas/sitemap/0.84";

    /** Entries of the first-level index, filled by {@link #readFirstSiteMap()} and {@link #execute(List)}. */
    private Set<FirstSiteMapModel> firstSiteMapSet = new HashSet<FirstSiteMapModel>();
    /** Directory prefix (with trailing separator) under which the sitemap files are stored. */
    private String serverPath;
    /** Full path of the first-level index file written by {@link #createFirstSiteMap()}. */
    private String firstSiteMapName = "";

    /**
     * Builds a second-level sitemap document with one {@code url} entry per news
     * item and writes it to {@code secondSiteMapFile}, replacing any existing file.
     *
     * @param wNewsList         news items to list; each contributes its id and URL time
     * @param secondSiteMapFile path of the file to write
     * @return the document that was built (returned even if writing the file failed)
     */
    private Document batchSecondSiteMapCreate(List<WNews> wNewsList, String secondSiteMapFile) {
        // The original called getAttributeValue("xmlns", ...), which only reads and
        // therefore never declared the namespace. Constructing the element with the
        // namespace URI is what actually puts xmlns on the root.
        Element rootNode = new Element("urlset", SITEMAP_NAMESPACE_URI);
        Document doc = new Document(rootNode);

        List<Element> urlNodeList = new ArrayList<Element>(wNewsList.size());
        for (WNews news : wNewsList) {
            Element urlNode = secondLevelElement("url");
            urlNode.addContent(
                    secondLevelElement("loc").setText("http://www.bt285.cn/content.php?id=" + news.getId()));
            urlNode.addContent(
                    secondLevelElement("lastmod").setText(DateUtil.getDateStr(news.getUrlTime())));
            urlNode.addContent(secondLevelElement("changefreq").setText("yearly"));
            urlNode.addContent(secondLevelElement("priority").setText("0.5"));
            urlNodeList.add(urlNode);
        }
        rootNode.addContent(urlNodeList);

        writeBytes(secondSiteMapFile, XmlUtils.toByte(doc));
        return doc;
    }

    /**
     * Creates an element in the second-level sitemap namespace. Descendants must
     * share the root's namespace, otherwise they would be serialized with an
     * explicit empty {@code xmlns=""} override.
     */
    private static Element secondLevelElement(String name) {
        return new Element(name, SITEMAP_NAMESPACE_URI);
    }

    /**
     * Builds the first-level index document from {@link #firstSiteMapSet} and writes
     * it to {@link #getFirstSiteMapName()}, replacing any existing file.
     */
    private void createFirstSiteMap() {
        Element rootNode = new Element("sitemapindex");
        Document doc = new Document(rootNode);

        List<Element> sitemapNodeList = new ArrayList<Element>(firstSiteMapSet.size());
        for (FirstSiteMapModel first : firstSiteMapSet) {
            Element sitemapNode = new Element("sitemap");
            sitemapNode.addContent(new Element("loc").setText(first.getLoc()));
            sitemapNode.addContent(new Element("lastmod").setText(first.getLastmod()));
            sitemapNodeList.add(sitemapNode);
        }
        rootNode.addContent(sitemapNodeList);

        writeBytes(getFirstSiteMapName(), XmlUtils.toByte(doc));
    }

    /**
     * Writes {@code data} to {@code path} unchanged, truncating any existing file.
     *
     * <p>The bytes are written as-is rather than being decoded to a {@code String}
     * and re-encoded by a {@code FileWriter}: both of those steps use the platform
     * default charset and can corrupt an already-encoded XML document. The stream
     * is closed in {@code finally} so it is released even when the write fails.
     * Failures are logged, not rethrown.
     */
    private void writeBytes(String path, byte[] data) {
        java.io.OutputStream out = null;
        try {
            out = new java.io.FileOutputStream(path, false);
            out.write(data);
            out.flush();
        } catch (IOException e) {
            logger.error("Failed to write sitemap file " + path, e);
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    logger.warn("Failed to close sitemap file " + path, e);
                }
            }
        }
    }

    /**
     * Parses a second-level sitemap file, registering a four-argument call to
     * {@code addSecondSiteMap} for every {@code urlset/url} element.
     *
     * <p>NOTE(review): this class has no active {@code addSecondSiteMap} method; the
     * only definition is inside the comment further down. Confirm that method is
     * restored before relying on this parser.
     *
     * @param fileName path of the second-level sitemap file to parse
     * @throws Exception if the file cannot be read or parsed
     */
    public void authSiteMap(String fileName) throws Exception {
        Digester digester = new Digester();
        digester.push(this);
        digester.addCallMethod("urlset/url", "addSecondSiteMap", 4);
        digester.addCallParam("urlset/url/loc", 0);
        digester.addCallParam("urlset/url/lastmod", 1);
        digester.addCallParam("urlset/url/changefreq", 2);
        digester.addCallParam("urlset/url/priority", 3);
        digester.parse(new File(fileName));
    }

    /**
     * Parses {@code sitemaps.xml} under {@link #getServerPath()} and calls
     * {@link #addFirstSiteMap(String, String)} for every {@code sitemapindex/sitemap}
     * element. Read and parse failures are logged and otherwise ignored, which
     * leaves the in-memory set as it was.
     */
    public void readFirstSiteMap() {
        Digester digester = new Digester();
        digester.push(this);
        digester.addCallMethod("sitemapindex/sitemap", "addFirstSiteMap", 2);
        digester.addCallParam("sitemapindex/sitemap/loc", 0);
        digester.addCallParam("sitemapindex/sitemap/lastmod", 1);

        String indexPath = getServerPath() + "sitemaps.xml";
        try {
            digester.parse(new File(indexPath));
        } catch (IOException e) {
            logger.error("Failed to read sitemap index " + indexPath, e);
        } catch (SAXException e) {
            logger.error("Failed to parse sitemap index " + indexPath, e);
        }
    }

    /**
     * Adds an index entry to {@link #firstSiteMapSet}; invoked by the Digester rules
     * set up in {@link #readFirstSiteMap()}.
     *
     * @param loc     text of the entry's {@code loc} element
     * @param lastmod text of the entry's {@code lastmod} element
     */
    public void addFirstSiteMap(String loc, String lastmod) {
        FirstSiteMapModel first = new FirstSiteMapModel();
        first.setLoc(loc);
        first.setLastmod(lastmod);
        firstSiteMapSet.add(first);
    }

    /*
     * Disabled callback referenced by authSiteMap; kept for reference.
     *
    public void addSecondSiteMap(String loc,String lastmod,String changefreq,String priority){
    SiteMapModel sitemap = new SiteMapModel();
    sitemap.setLoc(loc);
    sitemap.setLastmod(lastmod);
    sitemap.setChangefreq(changefreq);
    sitemap.setPriority(priority);
    secondSiteMapSet.add(sitemap);
    }*/

    /**
     * Returns the file name for a new second-level sitemap: a timestamp of the
     * current time, as formatted by {@code DateUtil}, followed by {@code ".xml"}.
     *
     * @return the generated file name (no directory part)
     */
    public String createSecondSiteMap() {
        return DateUtil.getDateTimeWithyyyyMMddHHmm(new Date()) + ".xml";
    }

    /**
     * Runs one generation pass: loads the current index, writes a new second-level
     * sitemap for {@code wNewsList}, then rewrites the index with an added entry
     * for the new file.
     *
     * @param wNewsList news items to publish in the new second-level sitemap
     */
    public void execute(List<WNews> wNewsList) {
        setServerPath("/home/tomcat/webapps/bt285/");
        //setServerPath("E:/work/study/res/");
        setFirstSiteMapName(getServerPath() + "sitemaps.xml");
        readFirstSiteMap();

        String secondSiteMapName = createSecondSiteMap();

        // Write the second-level file before the index is rewritten, so the
        // index is not published while the file it references does not exist yet.
        String secondFileDir = getServerPath() + "sitemap/";
        batchSecondSiteMapCreate(wNewsList, secondFileDir + secondSiteMapName);

        FirstSiteMapModel first = new FirstSiteMapModel();
        first.setLoc("http://www.bt285.cn/sitemap/" + secondSiteMapName);
        first.setLastmod(DateUtil.getDateStr(new Date()));
        addFirstSiteMapElement(getFirstSiteMapSet(), first);
        createFirstSiteMap();
    }

    /**
     * Command-line entry point; runs a generation pass with an empty news list.
     *
     * @param args unused
     * @throws Exception declared for compatibility; see {@link #execute(List)}
     */
    public static void main(String[] args) throws Exception {
        CreateSiteMap action = new CreateSiteMap();
        List<WNews> wNewsList = new ArrayList<WNews>();
        action.execute(wNewsList);
    }

    /** @return full path of the first-level index file */
    public String getFirstSiteMapName() {
        return firstSiteMapName;
    }

    /** @param firstSiteMapName full path of the first-level index file */
    public void setFirstSiteMapName(String firstSiteMapName) {
        this.firstSiteMapName = firstSiteMapName;
    }

    /** @return the live (not copied) set of first-level index entries */
    public Set<FirstSiteMapModel> getFirstSiteMapSet() {
        return firstSiteMapSet;
    }

    /**
     * Adds {@code first} to the given set and returns that same set.
     *
     * @param firstSiteMapSet set to add to (the argument, not necessarily this object's field)
     * @param first           entry to add
     * @return {@code firstSiteMapSet}, after the addition
     */
    public Set<FirstSiteMapModel> addFirstSiteMapElement(
            Set<FirstSiteMapModel> firstSiteMapSet, FirstSiteMapModel first) {
        firstSiteMapSet.add(first);
        return firstSiteMapSet;
    }

    /** @param firstSiteMapSet replacement set of first-level index entries */
    public void setFirstSiteMapSet(Set<FirstSiteMapModel> firstSiteMapSet) {
        this.firstSiteMapSet = firstSiteMapSet;
    }

    /** @return directory prefix under which the sitemap files are stored */
    public String getServerPath() {
        return serverPath;
    }

    /** @param serverPath directory prefix (with trailing separator) for the sitemap files */
    public void setServerPath(String serverPath) {
        this.serverPath = serverPath;
    }
}