PowerShell: Find a Downloadable File from a Web Page

Although the function below is adequate for simple purposes, it lacks the advanced algorithms needed to crawl websites. I’ll loop back and rewrite it when there’s a need for better tooling.

# findDownloadUrl_v0.0.2.ps1
# This little program will parse a web page for its downloadable contents
# The multi-threading code has been added

# Page the search starts from.
# NOTE(review): this value looks like a placeholder, and the spaces around '://' keep it
# from being a usable URL - replace it with the real address before running.
$startUrl='https :// somewebsite'
# Suffix a link must end with to be reported as the download.
$fileExtension='.exe'
# Handed to findDownloadUrl as its $maxDepth argument.
$maxDepth=3

function findDownloadUrl{
    <#
    .SYNOPSIS
        Crawls outward from a start page looking for a link that ends with a given file extension.
    .DESCRIPTION
        The start page is fetched first. Each link found on it is then explored in turn
        (links are sorted descending, so higher version numbers tend to be tried first).
        Pages are fetched in background jobs, up to $simultaneousJobs at a time. Links
        discovered on a page are queued one level deeper than that page and are only
        followed while their level does not exceed $maxDepth.
    .PARAMETER startUrl
        URL of the page to begin with. A trailing '/' is appended when missing.
    .PARAMETER fileExtension
        Wildcard suffix a link must end with, e.g. '.exe'.
    .PARAMETER maxDepth
        Deepest link level that will be fetched; links on the start page are level 1.
    .OUTPUTS
        The matching URL (string), or $false when nothing was found or the start URL is blank.
    #>
    param(
        $startUrl,
        $fileExtension,
        $maxDepth=3
    )
    $simultaneousJobs=8
    $linksChecked=0
    $firstResult=$false
    $timer=[System.Diagnostics.Stopwatch]::StartNew()
    if(!$startUrl){
        write-warning "Cannot start with a blank parent URL"
        # Nothing sensible can be crawled without a start URL.
        return $false
    }elseif($startUrl -notmatch '/$'){
        $startUrl=$startUrl+'/'
    }

    # Fetches one page and reports either the best matching download link or the page's links.
    # Returns @{result=$true;links=<url>} on a match, otherwise @{result=$false;links=<url array>}.
    # Must stay self-contained (no outer variables): its text is shipped into background jobs.
    function findFile($parentUrl,$extension){
        $ProgressPreference='SilentlyContinue'
        $ErrorActionPreference='stop'
        if($parentUrl -notmatch '/$'){$parentUrl=$parentUrl+'/'}
        try{
            $page=Invoke-WebRequest $parentUrl -TimeoutSec 10
            $baseUri=[System.Uri]$parentUrl
        }catch{
            # Unreachable or malformed pages simply contribute no links.
            return @{'result'=$false;'links'=@()}
        }
        $pageAddress=$baseUri.AbsoluteUri
        # Resolve every href against the page URL so that '/root/relative', 'relative' and
        # absolute links all become proper absolute URLs; non-http(s) schemes are dropped.
        $newLinks=@($page.links.href|?{$_}|%{
            $resolved=$null
            if([System.Uri]::TryCreate($baseUri,[string]$_,[ref]$resolved) -and $resolved.Scheme -match '^https?$'){
                $resolved.AbsoluteUri
            }
        }|?{
            # Skip the page itself and its ancestor directories ("Parent Directory" links).
            !($_.EndsWith('/') -and $pageAddress.StartsWith($_,[System.StringComparison]::OrdinalIgnoreCase))
        }|sort -Descending|select -Unique)
        $matchedExtension=$newLinks|?{$_ -like "*$extension"}|sort -Descending|select -First 1
        if($matchedExtension){
            return @{'result'=$true;'links'=$matchedExtension}
        }
        return @{'result'=$false;'links'=$newLinks}
    }

    write-host "Scanning $startUrl for file extension $fileExtension"
    $startResult=findFile $startUrl $fileExtension
    if($startResult['result']){
        # The start page links straight to a matching file - no crawl needed.
        write-host "Bingo!" -ForegroundColor Green
        $totalMinutes=[math]::round($timer.Elapsed.TotalMinutes,2)
        write-host "Minutes elapsed: $totalMinutes"
        return $startResult['links']
    }
    $startLinks=@($startResult['links'])
    # Every URL ever queued, so that no page is fetched twice.
    $knownLinks=@($startUrl)+$startLinks

    foreach ($link in $startLinks){
        write-host "Processing link at current depth: 1"
        # Work queue of pages still to fetch; each entry carries its own depth.
        $pending=@([pscustomobject]@{Url=$link;Depth=1})
        while($pending.Count -gt 0 -and !$firstResult){
            # Take up to $simultaneousJobs entries off the front of the queue.
            $batchSize=[math]::Min($simultaneousJobs,$pending.Count)
            $batch=@($pending[0..($batchSize-1)])
            if($pending.Count -gt $batchSize){
                $pending=@($pending[$batchSize..($pending.Count-1)])
            }else{
                $pending=@()
            }

            $running=@()
            foreach($entry in $batch){
                $thisLink=$entry.Url
                write-host "Parsing $thisLink"
                $job=Start-Job -ScriptBlock{
                    param($findFile,$thisLink,$fileExtension)
                    # The function arrives as text; rebuild it and run it inside the job.
                    return [ScriptBlock]::Create($findFile).invoke($thisLink,$fileExtension)
                } -ArgumentList ${function:findFile},$thisLink,$fileExtension
                $linksChecked+=1
                if($job){
                    $running+=[pscustomobject]@{Job=$job;Depth=$entry.Depth}
                }
            }

            # Collect results in queue order; only jobs started here are touched.
            $children=@()
            foreach($item in $running){
                $outcome=$null
                if(!$firstResult){
                    $outcome=@(Receive-Job -Job $item.Job -Wait -ErrorAction SilentlyContinue)|
                        ?{$_ -is [System.Collections.IDictionary]}|select -First 1
                }
                # -Force also stops and removes jobs still running after a hit.
                Remove-Job -Job $item.Job -Force
                if($null -eq $outcome){continue}
                if($outcome['result']){
                    write-host "Bingo!" -ForegroundColor Green
                    $firstResult=$outcome['links']
                    continue
                }
                $childDepth=$item.Depth+1
                if($childDepth -gt $maxDepth){continue}
                $addLinks=@($outcome['links']|?{$_ -and $_ -notin $knownLinks})
                if($addLinks.Count -gt 0){
                    write-host "Adding new links to depth $childDepth`:`r`n$(($addLinks|out-string).trim())"
                    $knownLinks+=$addLinks
                    $children+=@($addLinks|%{[pscustomobject]@{Url=$_;Depth=$childDepth}})
                }
            }
            # Newly found links go to the front so the search dives deeper first.
            $pending=$children+$pending
        }

        if($firstResult){
            $totalMinutes=[math]::round($timer.Elapsed.TotalMinutes,2)
            write-host "Minutes elapsed: $totalMinutes"
            return $firstResult
        }
    }

    $totalMinutes=[math]::round($timer.Elapsed.TotalMinutes,2)
    write-host "$linksChecked links have been checked in $totalMinutes minutes without finding file extension $fileExtension" -ForegroundColor Red
    return $false
}
# Run the search with the settings defined at the top of the script; the result is written to the output.
findDownloadUrl $startUrl $fileExtension $maxDepth
PS C:\Users\concu> findDownloadUrl $startUrl $fileExtension $maxDepth
Scanning http://apache.mirrors.pair.com/tomcat/tomcat-9/ for file extension .exe
Processing link at current depth: 1
Parsing http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/
Adding new links to depth 2:
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/RELEASE-NOTES
https://tomcat.apache.org/tomcat-9.0-doc/deployer-howto.html
https://tomcat.apache.org/tomcat-9.0-doc/changelog.html
https://tomcat.apache.org/tomcat-9.0-doc/building.html
https://tomcat.apache.org/
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/bin/
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/?C=S;O=A
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/?C=N;O=D
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/?C=M;O=A
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/?C=D;O=A
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/tomcat/tomcat-9/
Parsing http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/
Parsing http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/RELEASE-NOTES
Parsing https://tomcat.apache.org/tomcat-9.0-doc/deployer-howto.html
Parsing https://tomcat.apache.org/tomcat-9.0-doc/changelog.html
Parsing https://tomcat.apache.org/tomcat-9.0-doc/building.html
Parsing https://tomcat.apache.org/
Parsing http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/bin/
Parsing http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/?C=S;O=A
Adding new links to depth 3:
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/apache-tomcat-9.0.37-src.zip
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/apache-tomcat-9.0.37-src.tar.gz
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/?C=S;O=A
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/?C=N;O=D
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/?C=M;O=A
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/?C=D;O=A
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/src/tomcat/tomcat-9/v9.0.37/
Bingo!
Minutes elapsed: 0.08
http://apache.mirrors.pair.com/tomcat/tomcat-9/v9.0.37/bin/apache-tomcat-9.0.37.exe
# findDownloadUrl_v0.0.1.ps1
# This little algorithm will parse a web page for downloadable contents, matching given extensions
# The script is currently not optimized; multi-threading should greatly improve its performance

# Page the search starts from.
# NOTE(review): the spaces around '://' keep this from being a usable URL - remove them before running.
$startUrl='http :// apache.mirrors.pair.com/tomcat/tomcat-9/'
# Suffix a link must end with to be reported as the download.
$fileExtension='.exe'
# Handed to findDownloadUrl as its $maxDepth argument.
$maxDepth=2

function findDownloadUrl{
    <#
    .SYNOPSIS
        Sequentially crawls from a start page looking for a link that ends with a given file extension.
    .DESCRIPTION
        The start page is fetched first. Each relative link found on it is then explored in
        turn, one page at a time. Links discovered on a page are queued one level deeper than
        that page and are only followed while their level does not exceed $maxDepth.
        Absolute (http...) hrefs are ignored, so the crawl stays below the pages it started on.
    .PARAMETER parentUrl
        URL of the page to begin with. A trailing '/' is appended when missing.
    .PARAMETER fileExtension
        Wildcard suffix a link must end with, e.g. '.exe'.
    .PARAMETER maxDepth
        Deepest link level that will be fetched; links on the start page are level 1.
    .OUTPUTS
        The matching URL (string), or $false when nothing was found or the start URL is blank.
    #>
    param(
        $parentUrl,
        $fileExtension,
        $maxDepth=3
    )
    if(!$parentUrl){
        write-warning "Cannot start with a blank parent URL"
        # Nothing sensible can be crawled without a start URL.
        return $false
    }elseif($parentUrl -notmatch '/$'){
        $parentUrl=$parentUrl+'/'
    }
    $linksChecked=0

    # Fetches one page and reports either the best matching download link or the page's links.
    # Returns @{result=$true;links=<url>} on a match, otherwise @{result=$false;links=<url array>}.
    function findFile($parentUrl,$extension){
        $ProgressPreference='SilentlyContinue'
        $ErrorActionPreference='stop'
        if($parentUrl -notmatch '/$'){$parentUrl=$parentUrl+'/'}
        try{
            $page=Invoke-WebRequest $parentUrl -TimeoutSec 10
            $baseUri=[System.Uri]$parentUrl
        }catch{
            # Unreachable or malformed pages simply contribute no links.
            return @{'result'=$false;'links'=@()}
        }
        $pageAddress=$baseUri.AbsoluteUri
        # Only relative hrefs are followed; each is resolved against the page URL so that
        # '/root/relative' and 'relative' forms both become proper absolute URLs.
        $newLinks=@($page.links.href|?{$_ -and ($_ -notmatch '^http')}|%{
            $resolved=$null
            if([System.Uri]::TryCreate($baseUri,[string]$_,[ref]$resolved) -and $resolved.Scheme -match '^https?$'){
                $resolved.AbsoluteUri
            }
        }|?{
            # Skip the page itself and its ancestor directories ("Parent Directory" links).
            !($_.EndsWith('/') -and $pageAddress.StartsWith($_,[System.StringComparison]::OrdinalIgnoreCase))
        }|sort -Descending|select -Unique)
        $matchedExtension=$newLinks|?{$_ -like "*$extension"}|sort -Descending|select -First 1
        if($matchedExtension){
            return @{'result'=$true;'links'=$matchedExtension}
        }
        return @{'result'=$false;'links'=$newLinks}
    }

    $startPage=findFile $parentUrl $fileExtension
    $linksChecked+=1
    if($startPage['result']){
        # The start page links straight to a matching file - no crawl needed.
        write-host "Bingo!" -ForegroundColor Green
        return $startPage['links']
    }
    $links=@($startPage['links'])
    # Every URL ever queued, so that no page is fetched twice.
    $knownLinks=@($parentUrl)+$links

    foreach ($link in $links){
        write-host "Checking $link"
        # Work queue of pages still to fetch; each entry carries its own depth.
        $pending=@([pscustomobject]@{Url=$link;Depth=1})
        while($pending.Count -gt 0){
            # Pop the first entry; slicing is guarded because 1..0 would re-select element 0.
            $current=$pending[0]
            if($pending.Count -gt 1){
                $pending=@($pending[1..($pending.Count-1)])
            }else{
                $pending=@()
            }
            write-host "Parsing $($current.Url)"
            $result=findFile $current.Url $fileExtension
            $linksChecked+=1
            if($result['result']){
                write-host "Bingo!" -ForegroundColor Green
                return $result['links']
            }
            $childDepth=$current.Depth+1
            if($childDepth -gt $maxDepth){continue}
            $addLinks=@($result['links']|?{$_ -and $_ -notin $knownLinks})
            if($addLinks.Count -gt 0){
                write-host "Adding new links:`r`n$(($addLinks|out-string).trim())"
                $knownLinks+=$addLinks
                # Newly found links go to the front so the search dives deeper first.
                $pending=@($addLinks|%{[pscustomobject]@{Url=$_;Depth=$childDepth}})+$pending
            }
        }
    }

    write-host "$linksChecked links have been checked without any matching file extension $fileExtension" -ForegroundColor Red
    return $false
}
# Run the search with the settings defined above; the result is written to the output.
findDownloadUrl $startUrl $fileExtension $maxDepth

Leave a Reply

Your email address will not be published. Required fields are marked *