fix: write CSV progressively
This commit is contained in:
parent
b0273c0cc3
commit
a2564fb602
@ -1,4 +1,6 @@
|
|||||||
from urllib.parse import urlparse, urlunparse, urljoin
|
from urllib.parse import urlparse, urlunparse, urljoin
|
||||||
|
import os
|
||||||
|
from csv import DictWriter
|
||||||
|
|
||||||
def to_absolute(url, host):
|
def to_absolute(url, host):
|
||||||
if url == '/': return host
|
if url == '/': return host
|
||||||
@ -14,4 +16,10 @@ def to_absolute(url, host):
|
|||||||
|
|
||||||
return p
|
return p
|
||||||
|
|
||||||
|
def write_results(main, data):
    """Write the scraped rows collected so far to the site's ``texts.csv``.

    The file is rewritten from scratch on every call (mode ``'w'``), so
    calling this repeatedly during a crawl keeps the CSV on disk in step
    with ``data``.

    Args:
        main: Parsed URL of the site being scraped; its ``netloc`` names the
            sub-directory of ``results`` that receives the file.
        data: Iterable of dicts keyed by the CSV field names
            (``page``, ``tag``, ``text``, ``link``, ``image``).

    Raises:
        OSError: If the target file cannot be opened (e.g. the
            ``results/<netloc>`` directory does not exist).
    """
    path = os.path.join('results', main.netloc, 'texts.csv')
    # newline='' is required by the csv module: it lets the writer control
    # line endings itself, avoiding '\r\r\n' rows on platforms that translate
    # newlines and keeping embedded newlines in quoted fields intact.
    with open(path, 'w', newline='') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])

        w.writeheader()
        w.writerows(data)
|
||||||
|
|
||||||
|
13
index.py
13
index.py
@ -5,8 +5,7 @@ import os
|
|||||||
import shutil
|
import shutil
|
||||||
import argparse
|
import argparse
|
||||||
from urllib.parse import urlparse, urljoin
|
from urllib.parse import urlparse, urljoin
|
||||||
from csv import DictWriter
|
from helpers import to_absolute, write_results
|
||||||
from helpers import to_absolute
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']
|
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']
|
||||||
@ -82,6 +81,9 @@ for host in sites:
|
|||||||
p = to_absolute(href, host)
|
p = to_absolute(href, host)
|
||||||
if p and p.netloc == main.netloc:
|
if p and p.netloc == main.netloc:
|
||||||
queue.insert(0, p)
|
queue.insert(0, p)
|
||||||
|
|
||||||
|
|
||||||
|
write_results(main, data)
|
||||||
|
|
||||||
for link in queue:
|
for link in queue:
|
||||||
queue.remove(link)
|
queue.remove(link)
|
||||||
@ -90,10 +92,3 @@ for host in sites:
|
|||||||
scrape(link, depth=depth + 1)
|
scrape(link, depth=depth + 1)
|
||||||
|
|
||||||
scrape(main)
|
scrape(main)
|
||||||
|
|
||||||
with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:
|
|
||||||
w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
|
|
||||||
|
|
||||||
w.writeheader()
|
|
||||||
w.writerows(data)
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user