开发可视化爬虫模板编辑器

  • 时间:2014-05-07
  • 方式:原创

很久以前,我借助 Chrome 做过一个可视化的爬虫模板编辑工具。刚好有位朋友有相同的需求,就简单整理了一下。

文章中给出的不是完整的代码,只是一些核心思路。整体来说也很简单。

开发语言

开发语言是 JavaScript,思路是为 Chrome 开发一个扩展插件,这样我们就可以可视化地获取 XPath 了。

而 XPath 可以作为爬虫获取信息的模板使用。可视化的 XPath 工具在方便性和准确性上都比正则表达式高不少。

开发软件选择一款自己熟悉的纯文本编辑器就可以了。推荐 Sublime,它支持函数跳转、代码提示、outline 等功能,比较方便。

manifest.json

入口文件,用于配置插件的基本属性。

配置如下

{
  "name": "页面配置",
  "description": "快速的配置网页...",
  "version": "0.1",

  "permissions": ["contextMenus"],
  "background": {
    "persistent": false,
    "default_icon": "empty.png",
    "scripts": ["configXPath.js"]
  },

    "content_scripts": [
    {
      "matches": [
        "http://*/*",
        "https://*/*"
      ],
      "all_frames": true,
      "js": [
        "listener.js"
      ]
    }
  ],

  "manifest_version": 2
}

configXPath.js

定义右键的菜单

/**
 * Registers the extension's single context-menu entry.
 *
 * The entry is titled "获取Xpath", is offered in every context ("all") and is
 * identified by the id "configXpath".
 *
 * @param {*} id Accepted for interface compatibility; this function never reads it.
 */
function createInitMenus(id){
    var menuEntry = {};
    menuEntry["title"] = "获取Xpath";
    menuEntry["contexts"] = ["all"];
    menuEntry["id"] = "configXpath";

    chrome.contextMenus.create(menuEntry);
}

/**
 * Context-menu click callback.
 *
 * When the clicked entry is "configXpath", sends a {method: "getXPath"}
 * message to the tab the click came from; clicks on any other entry are ignored.
 *
 * @param {Object} info Click details; only `menuItemId` is read.
 * @param {Object} tab  Tab in which the click happened; only `id` is read.
 */
function onClickHandler(info, tab) {
    var isXpathEntry = (info.menuItemId == "configXpath");
    if (!isXpathEntry) {
        return;
    }

    chrome.tabs.sendMessage(tab.id, {method: "getXPath"});
}

// Route every context-menu click to onClickHandler.
chrome.contextMenus.onClicked.addListener(onClickHandler);

// Create the menu entry whenever chrome.runtime.onInstalled fires.
chrome.runtime.onInstalled.addListener(
    function() {
        createInitMenus();
    }
);

listener.js

/**
 * Returns a copy of this string with every whitespace character removed.
 *
 * @returns {string} The string without spaces, tabs, line breaks or other
 *     characters matched by the regex class \s.
 */
String.prototype.toPer = function()
{
    // The earlier pattern /([\\s]*)/g was a regex *literal*, so "[\\s]" meant
    // "a backslash or the letter s" and the method deleted those characters
    // instead of whitespace. /\s+/g matches the same whitespace class the
    // other helpers in this file strip.
    return this.replace(/\s+/g, "");
};


/**
 * Builds the template path of the element that contains the current text
 * selection.
 *
 * Starting at the parent of the selection's focus node, the function climbs
 * until it finds an ancestor whose text (whitespace removed) contains the
 * selected text (whitespace removed). From that element it climbs again,
 * stopping before <body>, and emits one segment per element: the node name,
 * optionally followed by ".id=<id>", ".name=<name>" or ".class=<class>" --
 * the first of those attributes that is non-empty and shorter than 15
 * characters. Segments are joined outermost-first with ">>" and the whole
 * path is lower-cased.
 *
 * @param {*} request      Not read by this function.
 * @param {*} sender       Not read by this function.
 * @param {*} sendResponse Not read by this function.
 * @returns {string} The path, or "" when there is no selection range or no
 *     ancestor contains the selected text.
 */
function getSelectObj(request, sender, sendResponse){
    var ELEMENT_NODE = 1;       // value of Node.ELEMENT_NODE
    var MAX_ATTR_LENGTH = 15;   // attribute values this long or longer are skipped

    // Removes all whitespace; null/undefined text (e.g. document.textContent)
    // is treated as empty instead of throwing.
    function stripWhitespace(text){
        return text == null ? "" : String(text).replace(/\s/g, "");
    }

    // Produces one path segment for an element: nodeName plus the first
    // usable attribute among id, name and class.
    function describeElement(element){
        var attrNames = ["id", "name", "class"];
        for (var i = 0; i < attrNames.length; i++){
            var value = element.getAttribute(attrNames[i]);
            if (value != null && value != "" && value.length < MAX_ATTR_LENGTH){
                return element.nodeName + "." + attrNames[i] + "=" + value;
            }
        }
        return element.nodeName;
    }

    var selection = window.getSelection();
    if (!selection || !(selection.rangeCount > 0) || !selection.focusNode){
        return "";
    }
    var selectedText = stripWhitespace(selection.toString());

    // Climb to the nearest ancestor whose text contains the whole selection.
    // The loop tests the node itself for null: comparing a node with "" would
    // stringify it, and an <a> without href stringifies to "".
    var node = selection.focusNode.parentNode;
    while (node != null && stripWhitespace(node.textContent).indexOf(selectedText) < 0){
        node = node.parentNode;
    }

    // Collect segments up to, but not including, <body>. Non-element nodes
    // (such as the document) have no attributes, so the walk stops there.
    var segments = [];
    while (node != null && node.nodeType === ELEMENT_NODE){
        if (node.nodeName.toLowerCase() == "body") break;
        segments.unshift(describeElement(node));
        node = node.parentNode;
    }

    return segments.join(">>").toLowerCase();
}


/**
 * Downloads the exported template as a text file named "<host>.template".
 *
 * Shows an alert and returns when no template object is given; returns
 * silently when the object has no `host`. Otherwise the text returned by
 * `templateinfo.exportTemplate()` is logged, wrapped in a text/plain Blob and
 * offered for download by dispatching a click event on a temporary <a>
 * element.
 *
 * @param {Object} templateinfo Object with a `host` property and an
 *     `exportTemplate()` method returning the file content.
 */
function writeFile(templateinfo){
    // `== null` matches both undefined and null. typeof null is "object", so
    // a typeof check lets null through and then fails on templateinfo.host.
    if(templateinfo == null) {
        alert("配置有误!");
        return;
    }

    if(typeof(templateinfo.host)=="undefined") return;

    // Export once and reuse the text for both the log line and the file.
    var textToWrite = templateinfo.exportTemplate();
    console.log("Python·G·Venture ---- "+templateinfo.host+"\n"+textToWrite);

    // Prefer the standard URL object; fall back to the prefixed one.
    var urlApi = window.URL || window.webkitURL;
    var textFileAsBlob = new Blob([textToWrite], {type:'text/plain'});
    var downloadLink = document.createElement("a");
    downloadLink.href = urlApi.createObjectURL(textFileAsBlob);
    downloadLink.download = (templateinfo.host+".template");

    // Synthesize a click so the browser starts the download.
    var ev = document.createEvent('MouseEvents');
    ev.initEvent('click', false, true);
    downloadLink.dispatchEvent(ev);
}


/**
 * Holder for one generated template.
 *
 * The constructor does not set `host` or `xpath`; code using an instance
 * assigns them before exporting.
 */
function XpathData() {
    // host  -- left unset here
    // xpath -- left unset here

    /**
     * Serializes the template as a single "xpath:<value>" line.
     *
     * @returns {string} "xpath:" + this.xpath followed by a line feed.
     */
    this.exportTemplate = function() {
        var line = "xpath:" + this.xpath;
        return line + "\n";
    };
}

// Template data filled in for each "getXPath" request. It has to be
// instantiated: with a bare declaration the handler below would throw on
// `xpathinfo.host = ...` because xpathinfo would still be undefined.
var xpathinfo = new XpathData();

// Handle messages sent to this content script. chrome.runtime.onMessage is
// the supported replacement for the deprecated chrome.extension.onMessage.
chrome.runtime.onMessage.addListener(function(request, sender, sendResponse) {
    if (request && request.method == "getXPath"){
        var str = getSelectObj(request, sender, sendResponse);
        // An empty path means nothing usable is selected; do not download.
        if(str=="") return;

        xpathinfo.host = window.location.host;
        xpathinfo.xpath = str;
        writeFile(xpathinfo);
    }
});