# Generic Scraper
# 2 months ago in Python
from bs4 import BeautifulSoup
# HTML text to scrape goes here (left empty as a placeholder).
html = ""

def getChildNodesWithValidText(bsnode, largestCommonNode):
    """Return the direct children of ``bsnode`` that lead to valid text.

    A child qualifies when it is itself valid text (see ``isNodeValidText``)
    or when at least one of its own children qualifies in turn.  While the
    tree is walked, ``largestCommonNode`` is updated to remember the node
    with the most qualifying direct children seen so far.

    Args:
        bsnode: Node to inspect.  An object without a ``children`` attribute
            is treated as a leaf and yields an empty list.
        largestCommonNode: Two-item mutable list ``[node, count]``.  It is
            modified in place whenever a node with strictly more qualifying
            children than ``count`` is visited.

    Returns:
        The list of qualifying direct children of ``bsnode``.
    """
    ignoredTags = ('script', 'style')
    cwvt = []
    for child in getattr(bsnode, 'children', ()):
        # Skip <script>/<style> subtrees completely.  Rejecting only the tag
        # itself is not enough: the text nested inside it does not carry the
        # tag's name, so recursing would let the tag back in through its
        # text child.
        if getattr(child, 'name', None) in ignoredTags:
            continue
        # Short-circuit kept on purpose: a child that is valid text on its
        # own is not descended into.
        if isNodeValidText(child) or getChildNodesWithValidText(child, largestCommonNode):
            cwvt.append(child)
    # Strict comparison: on a tie the node recorded first is kept.
    if len(cwvt) > largestCommonNode[1]:
        largestCommonNode[1] = len(cwvt)
        largestCommonNode[0] = bsnode
    return cwvt
def isNodeValidText(bsnode):
    """Tell whether ``bsnode`` holds a piece of human-readable text.

    A node is accepted when all of the following hold:

    * neither the node nor its direct parent is named ``script`` or
      ``style``;
    * its ``string`` attribute is not ``None``;
    * that string has more than three whitespace-separated words and more
      than six characters.

    Args:
        bsnode: Node to test.  Objects lacking ``name``, ``parent`` or
            ``string`` attributes (including ``None``) are rejected instead
            of raising.

    Returns:
        ``True`` when the node counts as valid text, ``False`` otherwise.
    """
    ignoredTags = ('script', 'style')
    if getattr(bsnode, 'name', None) in ignoredTags:
        return False
    # Text placed directly inside <script>/<style> does not carry the tag's
    # name itself, so the enclosing element has to be checked as well.
    parent = getattr(bsnode, 'parent', None)
    if getattr(parent, 'name', None) in ignoredTags:
        return False
    text = getattr(bsnode, 'string', None)
    if text is None:
        return False
    return len(text.split()) > 3 and len(text) > 6
def getLargestNode(bsOrigin):
    """Find the node with the most text-bearing direct children.

    Walks the tree rooted at ``bsOrigin`` through
    ``getChildNodesWithValidText`` and returns the accumulator it fills.

    Returns:
        A two-item list ``[node, count]``.  It is still ``[None, 0]`` when
        no visited node has any qualifying child.
    """
    best = [None, 0]
    getChildNodesWithValidText(bsOrigin, best)
    return best

# Parse the markup and locate the node holding the most text-bearing children.
soup = BeautifulSoup(html, 'html.parser')
# getLargestNode is defined in this file; no `utils` module is imported here,
# so the former `utils.getLargestNode` lookup raised NameError.
largestNode = getLargestNode(soup.body)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25