Generic Scraper

2 years ago in Python

                            from bs4 import BeautifulSoup
# Raw HTML markup to analyse; put the page source in this string.
html = ""
def getChildNodesWithValidText(bsnode, largestCommonNode):
    """Return the direct children of ``bsnode`` that lead to valid text.

    A child qualifies when ``isNodeValidText(child)`` is true, or when the
    recursive call on that child returns a non-empty list. A child that is
    itself valid text is not descended into (the ``or`` short-circuits).

    ``largestCommonNode`` is a mutable two-item list ``[node, count]`` used
    as an accumulator: whenever a visited node has strictly more qualifying
    children than the stored count, both slots are overwritten in place.

    Objects without a ``children`` attribute yield an empty list and leave
    the accumulator untouched.
    """
    if not hasattr(bsnode, 'children'):
        return []

    qualifying = [
        kid
        for kid in bsnode.children
        if isNodeValidText(kid)
        or getChildNodesWithValidText(kid, largestCommonNode)
    ]

    # Strict comparison: on a tie the previously recorded node is kept.
    count = len(qualifying)
    if count > largestCommonNode[1]:
        largestCommonNode[1] = count
        largestCommonNode[0] = bsnode
    return qualifying
def isNodeValidText(bsnode):
    """Tell whether ``bsnode`` carries a meaningful piece of text.

    The node must not be named ``script`` or ``style``, its ``string``
    attribute must not be ``None``, and that string must contain more than
    3 whitespace-separated words and more than 6 characters.
    """
    if bsnode.name in ('script', 'style'):
        return False

    text = bsnode.string
    if text is None:
        return False

    word_count = len(text.split())
    return word_count > 3 and len(text) > 6
def getLargestNode(bsOrigin):
    """Walk the tree under ``bsOrigin`` and report the best node found.

    Returns a two-item list ``[node, count]`` as filled in by
    ``getChildNodesWithValidText``: the node with the highest number of
    direct children leading to valid text, and that number. The list stays
    ``[None, 0]`` when no node exceeds a count of zero.
    """
    best = [None, 0]
    # Return value is ignored; only the in-place update of ``best`` matters.
    getChildNodesWithValidText(bsOrigin, best)
    return best
# Parse the markup held in ``html`` with the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')
# getLargestNode is defined in this file. The previous ``utils.`` prefix
# referred to a name that is never imported or defined here, so the call
# raised NameError. The result is a [node, count] pair.
largestNode = getLargestNode(soup.body)

                        

Recent Public Pastes

PRIVATE VIDEO COLLECTION HD WEBCAM JB VIDS MEGA LINKS - 20 seconds ago PRIVATE VIDEO COLLECTION HD WEBCAM JB VIDS MEGA LINKS - 6 hours ago PRIVATE VIDEO COLLECTION HD WEBCAM JB VIDS MEGA LINKS - 13 hours ago Untitled - 3 days ago bootp - 5 days ago Untitled - 5 days ago Untitled - 5 days ago Untitled - 5 days ago Untitled - 6 days ago Subscription - 6 days ago