Quantcast
Channel: VBForums - Visual Basic .NET
Viewing all articles
Browse latest Browse all 27407

VS 2012 Multiple threading for my program issues!

$
0
0
Hello! Newbie here :)

I am trying to make multiple threads for my Web Crawler program. I am expanding upon some old code I found lying around the internet.
It needs to be multi threaded or else everything freezes and gives no response or output until it is done what it is doing (which can take anywhere from 30 seconds to an hour).
I know how to make threads but since this is not entirely my code and the comments are not the greatest, I haven't been able to implement the threads.

Can anybody help me out?

Here is the code below.

Code:

Imports System.IO
Imports System.Net
Imports System.Text.RegularExpressions

Public Class web_crawler

    Private Sub startCrawl_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles startCrawl.Click
        output.Items.Clear()

        currentOperation.Text = "Begining Spider to find links..."
        Dim aList As ArrayList = Spider(inputURL.Text, depth.Text)

        currentOperation.Text = "Populating list of URLS..."
        For Each url As String In aList
            Dim crawlReq As WebClient = New WebClient
            crawlReq.Headers("User-Agent") = "MCLMATTY CRAWLER"
            currentOperation.Text = "Requesting " & url & "..."
            Dim arr() As Byte = crawlReq.DownloadData("http://search-engine.crawler.mclmatty.net/crawler.php?url=" & url)
            output.Items.Add("Found: " & url)
        Next

        currentOperation.Text = "Done. Awating new user input."
        output.Items.Add("Done")
    End Sub
    Private Function Spider(ByVal url As String, ByVal depth As Integer) As ArrayList
        'aReturn is used to hold the list of urls
        currentOperation.Text = "Creating Arrays..."
        Dim aReturn As New ArrayList
        'aStart is used to hold the new urls to be checked
        Dim aStart As ArrayList = GrabUrls(url)
        'temp array to hold data being passed to new arrays
        Dim aTemp As ArrayList
        'aNew is used to hold new urls before being passed to aStart
        Dim aNew As New ArrayList
        'add the first batch of urls
        currentOperation.Text = "Populating Arrays..."
        aReturn.AddRange(aStart)
        'if depth is 0 then only return 1 page
        If depth < 1 Then Return aReturn
        'loops through the levels of urls
        For i = 1 To depth
            'grabs the urls from each url in aStart
            For Each tUrl As String In aStart
                'grabs the urls and returns non-duplicates
                aTemp = GrabUrls(tUrl, aReturn, aNew)
                'add the urls to be check to aNew
                aNew.AddRange(aTemp)
            Next
            'swap urls to aStart to be checked
            aStart = aNew
            'add the urls to the main list
            aReturn.AddRange(aNew)
            'clear the temp array
            aNew = New ArrayList
        Next
        Return aReturn
        loopyThread.Abort()
    End Function
    Private Overloads Function GrabUrls(ByVal url As String) As ArrayList
        'will hold the urls to be returned
        Dim aReturn As New ArrayList
        Try
            'regex string used: thanks google
            currentOperation.Text = "Matching reqular expressions..."
            Dim strRegex As String = "<a.*?href=""(.*?)"".*?>(.*?)</a>"
            'Some regex notes... <a.*?href=""(.*?)"".*?>(.*?)</a>
            'And again... (?<=href\="")[^]+?(?="")

            'i used a webclient to get the source
            'web requests might be faster
            Dim wc As New WebClient
            'put the source into a string
            Dim strSource As String = wc.DownloadString(url)
            Dim HrefRegex As New Regex(strRegex, RegexOptions.IgnoreCase Or RegexOptions.Compiled)
            'parse the urls from the source
            Dim HrefMatch As Match = HrefRegex.Match(strSource)
            'used later to get the base domain without subdirectories or pages
            Dim BaseUrl As New Uri(url)
            'while there are urls
            While HrefMatch.Success = True
                'loop through the matches
                Dim sUrl As String = HrefMatch.Groups(1).Value
                'if it's a page or sub directory with no base url (domain)
                If Not sUrl.Contains("http://") Or sUrl.Contains("https://") AndAlso Not sUrl.Contains("www") Then
                    'add the domain plus the page
                    Dim tURi As New Uri(BaseUrl, sUrl)
                    sUrl = tURi.ToString
                End If
                'if it's not already in the list then add it
                If Not aReturn.Contains(sUrl) Then aReturn.Add(sUrl)
                'go to the next url
                HrefMatch = HrefMatch.NextMatch
            End While
        Catch ex As Exception
            'catch ex here. I left it blank while debugging
        End Try

        Return aReturn
    End Function
    Private Overloads Function GrabUrls(ByVal url As String, ByRef aReturn As ArrayList, ByRef aNew As ArrayList) As ArrayList
        'overloads function to check duplicates in aNew and aReturn
        'temp url arraylist
        Dim tUrls As ArrayList = GrabUrls(url)
        'used to return the list
        Dim tReturn As New ArrayList
        'check each item to see if it exists, so not to grab the urls again
        For Each item As String In tUrls
            If Not aReturn.Contains(item) AndAlso Not aNew.Contains(item) Then
                tReturn.Add(item)
            End If
        Next
        Return tReturn
    End Function
End Class


Viewing all articles
Browse latest Browse all 27407

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>