Data Collection Techniques Overview

This demo uses Apache httpd, Filebeat, Logstash, and CSV output to build a simple data collection pipeline, followed by several small Python crawlers.

1. Apache httpd

Runtime directory: /app/httpd/
Access log: /app/httpd/logs/access_log
Configuration file: /etc/httpd24/httpd.conf (change the listening port via the Listen directive)
Restart the service from /app/httpd/bin:
apachectl restart

1) Verify that httpd is running by opening http://127.0.0.1 in a browser.
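If no browser is available on the host, a quick command-line check works too. This is only a minimal sketch, assuming httpd is listening on port 80 of 127.0.0.1:

from urllib.request import urlopen

# Hit the local httpd instance and report the HTTP status code.
# Assumes the server from section 1 is reachable at http://127.0.0.1 (port 80).
with urlopen("http://127.0.0.1", timeout=5) as resp:
    print("httpd responded with status", resp.status)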

2. Filebeat

2.1 Directory layout

Installation path: /bigData/tools/filebeat/
Configuration file: /bigData/tools/filebeat/filebeat.yml

1) Edit the configuration file

filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /app/httpd/logs/access_log   # tail the httpd access log from section 1

output.logstash:
  hosts: ["localhost:5045"]        # forward events to Logstash (section 3)


2) Start Filebeat:
2.1) Change into the Filebeat directory

cd /bigData/tools/filebeat/

2.2) Start Filebeat (Logstash from section 3 should already be listening on port 5045, otherwise Filebeat will keep retrying the connection)

filebeat -c /bigData/tools/filebeat/filebeat.yml

3. Logstash

3.1 Directory layout

Installation path: /bigData/tools/logstash/
Configuration file: /bigData/tools/logstash/config/logstash-filebeat-csv.conf

input {
    # receive events from Filebeat
    beats {
        port => 5045
    }
}
filter {
    # parse each Apache Common Log Format line into named fields
    grok {
        match => { "message" => "%{HTTPD_COMMONLOG}" }
        #match => [ "message", "(?<ip>[:\d\.]+) - - \[(?<timestamp>[\S\s]+)\] (?<info>[\S\s]*)"]
    }
}
output {
    # write selected fields to a CSV file
    csv {
        path => "/home/output/httpd_file.csv"
        fields => ["clientip", "verb", "bytes"]
        #fields => ["ip", "timestamp"]
        csv_options => {"col_sep" => "    "}
    }
    # also print each parsed event to the console for debugging
    stdout {
        codec => rubydebug
    }
}
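The %{HTTPD_COMMONLOG} grok pattern splits an Apache Common Log Format line into fields such as clientip, verb, and bytes, which are exactly the columns written to the CSV above. The Python sketch below only illustrates that mapping with a plain regular expression and a made-up sample line; it is not part of the pipeline.

import re

# A sample access_log line in Apache Common Log Format (made-up values).
line = '192.168.1.10 - - [10/Oct/2023:13:55:36 +0800] "GET /index.html HTTP/1.1" 200 2326'

# Rough equivalent of the grok pattern: extract client IP, HTTP verb, and response size.
pattern = (r'(?P<clientip>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] '
           r'"(?P<verb>\S+) (?P<request>\S+) \S+" (?P<response>\d+) (?P<bytes>\S+)')

m = re.match(pattern, line)
if m:
    print(m.group("clientip"), m.group("verb"), m.group("bytes"))  # 192.168.1.10 GET 2326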

3.2 Steps

1) Change into the Logstash bin directory

cd /bigData/tools/logstash/bin

2) Start Logstash

logstash -f /bigData/tools/logstash/config/logstash-filebeat-csv.conf

As configured in the Logstash output section, the collected events are written to /home/output/httpd_file.csv.
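To confirm the pipeline end to end, you can inspect the first few rows of that file. A minimal sketch, assuming Logstash has already written at least one event:

# Print the first few rows of the CSV produced by Logstash.
# Assumes /home/output/httpd_file.csv exists; columns are clientip, verb, bytes,
# separated by the col_sep configured above (four spaces).
with open("/home/output/httpd_file.csv", encoding="utf-8") as f:
    for i, row in enumerate(f):
        print(row.rstrip())
        if i >= 4:
            break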

Crawlers

1. Fetch the site's home page, strip the <script> and <style> blocks, and use XPath to extract the logo text in the top-left corner.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
import time
from lxml import etree
from urllib import request


def getPage(base_url):
    try:
        req = request.Request(base_url)
        page = request.urlopen(req)
        content = page.read().decode("utf-8")

        # Strip <script> and <style> blocks before parsing.
        re_script = re.compile(r'\<script[\S\s]+?\</script\>', re.I)
        re_style = re.compile(r'\<style[\S\s]+?\</style\>', re.I)
        content = re_script.sub('', content)
        content = re_style.sub('', content)

        # Locate the logo link in the top-left corner and save its text.
        selector = etree.HTML(content)
        menu_items = selector.xpath('//*[@id="headContents"]/div[1]/a')
        for item in menu_items:
            writefile("/home/output/crawler_result.csv", item.text)

    except Exception as e:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False


def writefile(filename, content):
    try:
        fp = open(filename, 'a')
        fp.write(content + "\n")
        fp.close()
    except:
        return False


now = time.strftime('%Y-%m-%d %X', time.localtime(time.time()))
try:
    url = 'http://117.73.9.229:9090/'
    getPage(url)

except Exception as e:
    info = '%s\nError: %s' % (now, e)
    writefile('Error.log', info)
    print(info)
    time.sleep(1)

2. Target: extract at least three news headlines from the news list.

import re
import sys
import time
from lxml import etree
from urllib import request


def getPage(base_url):
    try:
        page = request.urlopen(base_url)
        content = page.read().decode("utf-8", "ignore")

        # Each news headline sits in an <h3> inside the ordered list of news items.
        selector = etree.HTML(content)
        menu_items = selector.xpath("/html/body/div[5]/div[1]/div[1]/div/div[2]/ol/li/h3")
        for item in menu_items:
            writefile("/home/output/crawler_result.csv", item.text)

    except Exception as e:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False


def writefile(filename, content):
    try:
        fp = open(filename, 'a')
        fp.write(content + "\n")
        fp.close()
    except:
        return False


now = time.strftime('%Y-%m-%d %X', time.localtime(time.time()))

try:
    url = 'http://117.73.9.229:9090/'
    getPage(url)

except Exception as e:
    info = '%s\nError: %s' % (now, e)
    writefile('Error.log', info)
    print(info)
    time.sleep(1)

3. Target: the value of the <title> element in the page's <head>.

import csv
import re
from urllib.request import urlopen

url = "http://117.73.9.229:9090/"
r = urlopen(url)
html = r.read().decode()

# Grab the contents of the <title> element with a regular expression.
pattern = r"<title>(.+)</title>"
m = re.search(pattern, html)
if m is not None:
    result = m.group(1)
    with open("/home/output/crawler_result.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([result])
else:
    print("title not found")

4. For the <ul> element whose id is "head_nav_list", collect the href attribute of every <a> nested inside its child <li> elements.

import re
import sys
import time
from lxml import etree
from urllib import request


def getPage(base_url):
    try:
        page = request.urlopen(base_url)
        content = page.read().decode("utf-8", "ignore")
        selector = etree.HTML(content)

        # Answer one: select the <a> elements, then read their href attribute.
        # menu_items = selector.xpath("/html/body/header/div/ul[@id='head_nav_list']/li/a")
        # for item in menu_items:
        #     writefile("/home/output/crawler_result.csv", item.attrib.get("href"))

        # Answer two: select the href attribute values directly in the XPath expression.
        menu_items = selector.xpath("/html/body/header/div/ul[@id='head_nav_list']/li/a/@href")
        for item in menu_items:
            writefile("/home/output/crawler_result.csv", item)

    except Exception as e:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False


def writefile(filename, content):
    try:
        fp = open(filename, 'a')
        fp.write(content + "\n")
        fp.close()
    except:
        return False


now = time.strftime('%Y-%m-%d %X', time.localtime(time.time()))

try:
    url = 'http://117.73.9.229:9090/'
    getPage(url)

except Exception as e:
    info = '%s\nError: %s' % (now, e)
    writefile('Error.log', info)
    print(info)
    time.sleep(1)