from bs4 import BeautifulSoup

# HTML text to analyse goes here.
html = ""

# Tags whose content is code or styling rather than readable text.
_NON_TEXT_TAGS = ('script', 'style')


def getChildNodesWithValidText(bsnode, largestCommonNode):
    """Return the direct children of ``bsnode`` that lead to valid text.

    A child qualifies when it is valid text itself (see ``isNodeValidText``)
    or when at least one of its own descendants is.

    Args:
        bsnode: Node to inspect. Objects without a ``children`` attribute
            (for example ``None``) produce an empty list.
        largestCommonNode: Two-item mutable list ``[node, count]`` that is
            updated in place. Whenever a visited node has strictly more
            qualifying children than ``count``, the pair is replaced by
            that node and its child count.

    Returns:
        The list of qualifying direct children of ``bsnode``.
    """
    # Text nested inside <script>/<style> has ``name`` None and would pass
    # isNodeValidText on its own, so those tags are never descended into.
    if getattr(bsnode, 'name', None) in _NON_TEXT_TAGS:
        return []

    cwvt = []
    if hasattr(bsnode, 'children'):
        for child in bsnode.children:
            # The recursive call only runs when the child is not valid text
            # itself; an empty result list is falsy.
            if isNodeValidText(child) or getChildNodesWithValidText(
                    child, largestCommonNode):
                cwvt.append(child)

    # Strict comparison: on a tie the node recorded first is kept.
    if len(cwvt) > largestCommonNode[1]:
        largestCommonNode[1] = len(cwvt)
        largestCommonNode[0] = bsnode
    return cwvt


def isNodeValidText(bsnode):
    """Return True when ``bsnode`` carries a meaningful piece of text.

    The node must not be a ``script`` or ``style`` tag, its ``string`` must
    not be None, and that string must contain more than three
    whitespace-separated words and more than six characters.
    """
    if bsnode.name in _NON_TEXT_TAGS:
        return False
    text = bsnode.string
    return text is not None and len(text.split()) > 3 and len(text) > 6


def getLargestNode(bsOrigin):
    """Find the node under ``bsOrigin`` with the most valid-text children.

    Returns:
        A two-item list ``[node, count]``. It is ``[None, 0]`` when no node
        with at least one qualifying child is found.
    """
    largestNode = [None, 0]
    getChildNodesWithValidText(bsOrigin, largestNode)
    return largestNode


soup = BeautifulSoup(html, 'html.parser')
# Called directly: getLargestNode is defined in this file and no ``utils``
# module is imported here.
largestNode = getLargestNode(soup.body)