Development, Design, Life

Web Crawler System

  • 2014-05-29 15:13:01 (updated 2014-06-18 08:47:17) Web Crawler System

    Fetching job listings from 51job

    #!/usr/bin/env python
    # coding: utf-8
    from scrapy.spider import Spider
    from scrapy.selector import Selector
    from scrapy.http import Request
    
    class JobSpider(Spider):
        name = "job"
        allowed_domains = ["51job.com"]
        # 51job search for "scrapy python" jobs in Shanghai (jobarea=020000)
        start_urls = (
            'http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=020000&funtype=0000&industrytype=00&keyword=scrapy%20python&keywordtype=2&lang=c&stype=1&postchannel=0000&fromType=1',
        )
    
        def parse(self, response):
            '''
            First-level page parsing: extract the links to the detail pages
            and build a request object for each of them.
            '''
            sel = Selector(response)
            jobs = sel.xpath('//div[@class="resultListDiv"]//td[@class="td1"]/a/@href').extract()
            jobReqs = []
            for job in jobs:
                # Build the second-level (detail) page request and attach its callback
                req = Request(job, self.parseJobDetail)
                jobReqs.append(req)
            return jobReqs
    
        def parseJobDetail(self, response):
            '''
            Second-level page parsing: extract the job detail text.
            '''
            sel = Selector(response)
            details = sel.xpath('//td[@class="job_detail"]//text()').extract()
            content = '\n'.join(details)
            print(content)
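
    To try the spider outside a full Scrapy project, it can be started with `scrapy runspider` (assuming the code is saved as, say, job_spider.py) or driven programmatically. The snippet below is a minimal sketch, assuming a Scrapy version where both the old `scrapy.spider` import used above and `scrapy.crawler.CrawlerProcess` are available (roughly the 1.x series); the `USER_AGENT` and `DOWNLOAD_DELAY` settings are assumptions, not something the original post requires.

    # Minimal sketch: run JobSpider without creating a full Scrapy project.
    # Assumes scrapy.crawler.CrawlerProcess is available (Scrapy 1.x+).
    from scrapy.crawler import CrawlerProcess
    
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',   # assumed: some sites reject Scrapy's default UA
        'DOWNLOAD_DELAY': 1,           # assumed: be polite to the target site
    })
    process.crawl(JobSpider)
    process.start()                    # blocks until the crawl finishes

    In a larger crawler system, `parseJobDetail` would normally yield a Scrapy Item (or a plain dict) instead of printing, so that item pipelines can clean and persist the data; printing is only convenient for a quick check of the XPath expressions.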