Overview of Data Collection Techniques
This section demonstrates end-to-end data collection using Apache httpd, Filebeat, and Logstash, writing the results to CSV.
1. Apache httpd
Runtime directory: /app/httpd/
Access log: /app/httpd/logs/access_log
Configuration file: /etc/httpd24/httpd.conf (change the listening port via the Listen directive)
Restart the service (from /app/httpd/bin):
apachectl restart
1) Verify httpd is running by opening http://127.0.0.1 in a browser.
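For a scriptable check, a minimal Python sketch (assuming httpd listens on the default port 80) can confirm the server responds:

from urllib.request import urlopen

# Request the local httpd instance; a response code of 200 means the server is up.
resp = urlopen("http://127.0.0.1/")
print(resp.getcode())  # expect 200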
2. Filebeat
2.1 Directory information
Installation path: /bigData/tools/filebeat/
Configuration file: /bigData/tools/filebeat/filebeat.yml
1) Edit the configuration file:
filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /app/httpd/logs/access_log
output.logstash:
  hosts: ["localhost:5045"]
2) Start Filebeat:
2.1) Change into the filebeat directory:
cd /bigData/tools/filebeat/
2.2) Start filebeat:
./filebeat -c /bigData/tools/filebeat/filebeat.yml
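To see events flow end to end, the access log needs some traffic. A small sketch to generate a few requests against the local httpd (again assuming it listens on 127.0.0.1:80):

from urllib.request import urlopen

# Each request appends one line to /app/httpd/logs/access_log,
# which Filebeat then forwards to Logstash on port 5045.
for _ in range(5):
    urlopen("http://127.0.0.1/").read()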
3. Logstash
3.1 Directory information
Installation path: /bigData/tools/logstash/
Configuration file: /bigData/tools/logstash/config/logstash-filebeat-csv.conf
input {
  beats {
    port => 5045
  }
}
filter {
  grok {
    match => { "message" => "%{HTTPD_COMMONLOG}" }
    #match => [ "message", "(?<ip>[:\d\.]+) - - \[(?<timestamp>[\S\s]+)\] (?<info>[\S\s]*)"]
  }
}
output {
  csv {
    path => "/home/output/httpd_file.csv"
    fields => ["clientip", "verb", "bytes"]
    #fields => ["ip", "timestamp"]
    csv_options => {"col_sep" => " "}
  }
  stdout {
    codec => rubydebug
  }
}
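The %{HTTPD_COMMONLOG} grok pattern splits each Apache common-log line into named fields, including the clientip, verb, and bytes fields selected in the csv output. As a rough illustration, here is a simplified Python stand-in for that pattern (not the full grok definition), run on a hypothetical log line:

import re

# Simplified approximation of HTTPD_COMMONLOG; the real grok pattern is more permissive.
COMMONLOG = re.compile(
    r'(?P<clientip>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] '
    r'"(?P<verb>\S+) (?P<request>\S+) \S+" (?P<response>\d+) (?P<bytes>\d+|-)'
)

line = '127.0.0.1 - - [10/Oct/2023:13:55:36 +0800] "GET /index.html HTTP/1.1" 200 2326'  # hypothetical line
m = COMMONLOG.match(line)
if m:
    print(m.group("clientip"), m.group("verb"), m.group("bytes"))  # 127.0.0.1 GET 2326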
3.2 Steps
1) Change into the logstash bin directory:
cd /bigData/tools/logstash/bin
2) Start Logstash:
./logstash -f /bigData/tools/logstash/config/logstash-filebeat-csv.conf
The CSV output is written to the path configured in the output section above: /home/output/httpd_file.csv.
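Once events are flowing, a quick sketch to inspect the resulting file (note the space separator configured through csv_options above):

import csv

# The Logstash config writes space-separated rows: clientip, verb, bytes.
with open("/home/output/httpd_file.csv", newline="") as f:
    for row in csv.reader(f, delimiter=" "):
        print(row)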
Crawlers
1. Target: fetch the site homepage, strip the script and style blocks, and use XPath to extract the logo text from the top-left corner.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import time
from lxml import etree
from urllib import request

def getPage(base_url):
    try:
        req = request.Request(base_url)
        page = request.urlopen(req)
        content = page.read().decode("utf-8")
        re_script = re.compile(r'<script[\S\s]+?</script>', re.I)  # <script> blocks
        re_style = re.compile(r'<style[\S\s]+?</style>', re.I)     # <style> blocks
        content = re_script.sub('', content)  # strip script blocks
        content = re_style.sub('', content)   # strip style blocks
        selector = etree.HTML(content)
        # Logo link in the top-left corner of the page header.
        menu_items = selector.xpath('//*[@id="headContents"]/div[1]/a')
        for item in menu_items:
            writefile("/home/output/crawler_result.csv", item.text)
    except Exception:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False

def writefile(filename, content):
    try:
        fp = open(filename, 'a')
        fp.write(content + "\n")
        fp.close()
    except OSError:
        return False

now = time.strftime('%Y-%m-%d %X', time.localtime(time.time()))
try:
    url = 'http://117.73.9.229:9090/'
    getPage(url)
except Exception as e:
    info = '%s\nError: %s' % (now, e)
    writefile('Error.log', info)
    print(info)
time.sleep(1)
2. Target: collect at least three news headlines from the news list.
import sys
import time
import urllib.request
from lxml import etree

def getPage(base_url):
    try:
        page = urllib.request.urlopen(base_url)
        # Decode then re-encode to drop any bytes that are not valid UTF-8.
        content = page.read().decode("utf-8", "ignore").encode("utf-8", "ignore")
        selector = etree.HTML(content)
        # Headline <h3> nodes inside the news list.
        menu_items = selector.xpath("/html/body/div[5]/div[1]/div[1]/div/div[2]/ol/li/h3")
        for item in menu_items:
            writefile("/home/output/crawler_result.csv", item.text)
    except Exception:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False

def writefile(filename, content):
    try:
        fp = open(filename, 'a')
        fp.write(content + "\n")
        fp.close()
    except OSError:
        return False

now = time.strftime('%Y-%m-%d %X', time.localtime(time.time()))
try:
    url = 'http://117.73.9.229:9090/'
    getPage(url)
except Exception as e:
    info = '%s\nError: %s' % (now, e)
    writefile('Error.log', info)
    print(info)
time.sleep(1)
3. Target: extract the value of the <title> element in the page head.
from urllib.request import urlopen
import csv
import re

url = "http://117.73.9.229:9090/"
r = urlopen(url)
html = r.read().decode()
pattern = r"<title>(.+)</title>"
m = re.search(pattern, html)
if m is not None:
    result = m.group(1)
    with open("/home/output/crawler_result.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([result])
else:
    print("Title not found")
4. Target: for the <ul> element whose id is "head_nav_list", collect the href attribute values from the <a> elements nested under its <li> children.
import sys
import time
import urllib.request
from lxml import etree

def getPage(base_url):
    try:
        page = urllib.request.urlopen(base_url)
        # Decode then re-encode to drop any bytes that are not valid UTF-8.
        content = page.read().decode("utf-8", "ignore").encode("utf-8", "ignore")
        selector = etree.HTML(content)
        # Approach one: select the <a> nodes, then read each node's href attribute.
        # menu_items = selector.xpath("/html/body/header/div/ul[@id='head_nav_list']/li/a")
        # for item in menu_items:
        #     writefile("/home/output/crawler_result.csv", item.attrib.get("href"))
        # Approach two: select the href attribute values directly in the XPath.
        menu_items = selector.xpath("/html/body/header/div/ul[@id='head_nav_list']/li/a/@href")
        for item in menu_items:
            writefile("/home/output/crawler_result.csv", item)
    except Exception:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False

def writefile(filename, content):
    try:
        fp = open(filename, 'a')
        fp.write(content + "\n")
        fp.close()
    except OSError:
        return False

now = time.strftime('%Y-%m-%d %X', time.localtime(time.time()))
try:
    url = 'http://117.73.9.229:9090/'
    getPage(url)
except Exception as e:
    info = '%s\nError: %s' % (now, e)
    writefile('Error.log', info)
    print(info)
time.sleep(1)