|
|
using System.Text.RegularExpressions;
...
// Print the captured text of every <title>...</title> pair found in deinHTMLText.
// RegexOptions.Singleline makes '.' match line breaks too, so a title that
// spans several lines is still captured; the lazy (.+?) stops at the first
// closing tag.
foreach (Match titleMatch in Regex.Matches(deinHTMLText, @"<title>(.+?)</title>", RegexOptions.Singleline)) {
    Console.WriteLine(titleMatch.Groups[1].Value);
}
/// <summary>
/// Returns the text located between the first occurrence of <paramref name="pre"/>
/// and the next occurrence of <paramref name="after"/> that follows it.
/// </summary>
/// <param name="source">Text to search in.</param>
/// <param name="pre">Marker that precedes the wanted text.</param>
/// <param name="after">Marker that follows the wanted text.</param>
/// <returns>
/// The text between the two markers, or an empty string when an argument is
/// null, <paramref name="source"/> is empty, or either marker cannot be found.
/// </returns>
public string ExtractString(string source, string pre, string after) {
    if (string.IsNullOrEmpty(source) || pre == null || after == null) return "";

    // Ordinal search: the markers are literal markup, not linguistic text.
    int start = source.IndexOf(pre, System.StringComparison.Ordinal);
    if (start < 0) return "";

    // Look for the closing marker only behind the opening marker, so that a
    // closing marker overlapping the opening one (e.g. pre "<b>", after ">")
    // cannot produce a negative length.
    int contentStart = start + pre.Length;
    int ende = source.IndexOf(after, contentStart, System.StringComparison.Ordinal);
    if (ende < 0) return "";

    return source.Substring(contentStart, ende - contentStart);
}
|
|
using System;
using System.Diagnostics;
using System.IO;
using System.Net;
using System.Net.Security;
using System.Text;
using System.Threading;
namespace Netzwerkfunktionen.HTTP_Post_Request {
    /// <summary>
    /// Sends HTTP requests through <see cref="HttpWebRequest"/> with form data
    /// given as <see cref="WebPostEntry"/> items. All requests of one instance
    /// share the same cookie container and user agent string.
    /// </summary>
    public class HTTP_Post_Request {
        private String _UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; InfoPath.2; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

        // Accepts every certificate.
        // SECURITY: installing this callback disables certificate validation.
        private RemoteCertificateValidationCallback ignoreInvalidCertificateValidationCallback = delegate { return true; };

        /// <summary>
        /// Cookie container that is attached to every request issued by this instance.
        /// </summary>
        public CookieContainer objCookieContainer = new CookieContainer();

        /// <summary>
        /// Gets or sets whether (SSL) certificates that the operating system
        /// considers invalid are accepted as well.
        /// </summary>
        /// <remarks>
        /// The setter assigns the static
        /// <see cref="ServicePointManager.ServerCertificateValidationCallback"/>,
        /// so it is not limited to this instance; setting <c>false</c> resets that
        /// callback to <c>null</c>.
        /// </remarks>
        public bool IgnoreInvalidCertificates {
            get { return (ServicePointManager.ServerCertificateValidationCallback == ignoreInvalidCertificateValidationCallback); }
            set { ServicePointManager.ServerCertificateValidationCallback = (value) ? ignoreInvalidCertificateValidationCallback : null; }
        }

        /// <summary>
        /// Gets or sets the value sent in the HTTP User-Agent header.
        /// </summary>
        public String UserAgent {
            get { return this._UserAgent; }
            set { this._UserAgent = value; }
        }

        /// <summary>
        /// Sends the request and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="VIEWSTATE">Not used by this implementation.</param>
        /// <param name="EVENTVALIDATION">Not used by this implementation.</param>
        /// <param name="Method">HTTP method, e.g. "GET" or "POST" (case-insensitive).</param>
        /// <param name="Referer">Value for the Referer header.</param>
        /// <param name="arrPostData">Form fields to send; null entries are skipped.</param>
        /// <returns>The response text, or an empty string when no readable response stream was obtained.</returns>
        [DebuggerStepThrough]
        public string RequestPost(string url, string VIEWSTATE, string EVENTVALIDATION, string Method, string Referer, params WebPostEntry[] arrPostData) {
            return RequestPost(url, VIEWSTATE, EVENTVALIDATION, Method, Referer, Encoding.UTF8, arrPostData);
        }

        /// <summary>
        /// Sends the request and returns the response body decoded with <paramref name="encoding"/>.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="VIEWSTATE">Not used by this implementation.</param>
        /// <param name="EVENTVALIDATION">Not used by this implementation.</param>
        /// <param name="Method">HTTP method, e.g. "GET" or "POST" (case-insensitive).</param>
        /// <param name="Referer">Value for the Referer header.</param>
        /// <param name="encoding">Encoding for the POST body bytes and for decoding the response.</param>
        /// <param name="arrPostData">Form fields to send; null entries are skipped.</param>
        /// <returns>The response text, or an empty string when no readable response stream was obtained.</returns>
        [DebuggerStepThrough]
        public string RequestPost(string url, string VIEWSTATE, string EVENTVALIDATION, string Method, string Referer, Encoding encoding, params WebPostEntry[] arrPostData) {
            MemoryStream receiveStream = this.RequestPost_Stream(url, VIEWSTATE, EVENTVALIDATION, Method, Referer, encoding, arrPostData);
            if (receiveStream == null) return "";

            // using guarantees the buffer is released even if decoding throws.
            using (receiveStream) {
                if (!receiveStream.CanRead) return "";
                using (StreamReader readStream = new StreamReader(receiveStream, encoding)) {
                    return readStream.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Sends the request and returns the raw response body as a stream positioned at 0.
        /// </summary>
        /// <remarks>
        /// For GET the form fields are appended to the URL as a query string; for
        /// POST they are written to the body as application/x-www-form-urlencoded.
        /// A "502 Bad Gateway" failure is retried once after one second; any other
        /// exception is rethrown unchanged.
        /// </remarks>
        /// <param name="url">Target URL.</param>
        /// <param name="VIEWSTATE">Not used by this implementation.</param>
        /// <param name="EVENTVALIDATION">Not used by this implementation.</param>
        /// <param name="Method">HTTP method, e.g. "GET" or "POST" (case-insensitive).</param>
        /// <param name="Referer">Value for the Referer header.</param>
        /// <param name="encoding">Encoding used to turn the POST data into bytes.</param>
        /// <param name="arrPostData">Form fields to send; null entries are skipped.</param>
        /// <returns>
        /// The response body, or <c>null</c> when the status code is not 200 OK or
        /// both attempts ended with 502 Bad Gateway. The caller owns the stream.
        /// </returns>
        [DebuggerStepThrough]
        public MemoryStream RequestPost_Stream(string url, string VIEWSTATE, string EVENTVALIDATION, string Method, string Referer, Encoding encoding, params WebPostEntry[] arrPostData) {
            Int16 retryCount = 0;
            while (retryCount <= 1) {
                HttpWebResponse objResponse = null;
                try {
                    string strPostData = String.Empty;
                    if (arrPostData != null)
                        strPostData = WebPostEntry.FromArrayToString(arrPostData);
                    bool hasPostData = !string.IsNullOrEmpty(strPostData);

                    // Normalize once so that "get"/"post" behave exactly like "GET"/"POST".
                    string httpMethod = Method.ToUpperInvariant();

                    string requestUrl = url;
                    if (httpMethod == "GET" && hasPostData) {
                        // Extend an existing query string instead of adding a second '?'.
                        string separator = (url != null && url.IndexOf('?') >= 0) ? "&" : "?";
                        requestUrl = url + separator + strPostData;
                    }

                    HttpWebRequest objWR = (HttpWebRequest)WebRequest.Create(requestUrl);
                    objWR.CookieContainer = objCookieContainer;
                    objWR.UserAgent = this._UserAgent;
                    objWR.Method = httpMethod;
                    objWR.Referer = Referer;

                    if (httpMethod == "POST" && hasPostData) {
                        objWR.ContentType = "application/x-www-form-urlencoded";
                        byte[] bytPostData = encoding.GetBytes(strPostData);
                        objWR.ContentLength = bytPostData.Length;
                        using (Stream objRequestStream = objWR.GetRequestStream()) {
                            objRequestStream.Write(bytPostData, 0, bytPostData.Length);
                        }
                    }

                    objResponse = (HttpWebResponse)objWR.GetResponse();
                    if (objResponse.StatusCode != HttpStatusCode.OK) {
                        return null;
                    }

                    // Copy the body into memory so the network response can be closed
                    // while the caller keeps reading.
                    const int bufferSize = 256;
                    byte[] buffer = new byte[bufferSize];
                    MemoryStream memStream = new MemoryStream();
                    using (Stream receiveStream = objResponse.GetResponseStream()) {
                        int countBytesRead;
                        while ((countBytesRead = receiveStream.Read(buffer, 0, bufferSize)) > 0) {
                            memStream.Write(buffer, 0, countBytesRead);
                        }
                    }
                    memStream.Position = 0;
                    return memStream;
                }
                catch (Exception ex) {
                    if (!IsBadGateway(ex)) {
                        throw;
                    }
                    // Release the failed response before trying again.
                    WebException webEx = ex as WebException;
                    if (webEx != null && webEx.Response != null)
                        webEx.Response.Close();
                    retryCount++;
                    // Wait only when another attempt will actually follow.
                    if (retryCount <= 1)
                        Thread.Sleep(1000);
                }
                finally {
                    if (objResponse != null)
                        objResponse.Close();
                }
            }
            return null;
        }

        // Tells whether the exception represents an HTTP 502 Bad Gateway answer.
        // The status code is checked first because exception messages may be
        // localized; the message test is kept as a fallback.
        private static bool IsBadGateway(Exception ex) {
            WebException webEx = ex as WebException;
            if (webEx != null) {
                HttpWebResponse errorResponse = webEx.Response as HttpWebResponse;
                if (errorResponse != null && errorResponse.StatusCode == HttpStatusCode.BadGateway)
                    return true;
            }
            return ex.Message.Contains("502") && ex.Message.Contains("Bad Gateway");
        }
    }
}
using System.Diagnostics;
namespace Netzwerkfunktionen.HTTP_Post_Request {
    /// <summary>
    /// One name/value pair of a form submission.
    /// </summary>
    public class WebPostEntry {
        public string Name;
        public string Value;

        /// <summary>
        /// Creates an entry from the given field name and value.
        /// </summary>
        [DebuggerStepThrough]
        public WebPostEntry(string name, string value) {
            Name = name;
            Value = value;
        }

        /// <summary>
        /// Renders the entry as "&amp;name=value" with both parts URL-encoded.
        /// The name uses the default UrlEncode overload, the value is encoded
        /// with code page 1250.
        /// </summary>
        [DebuggerStepThrough]
        public override string ToString() {
            string encodedName = System.Web.HttpUtility.UrlEncode(this.Name);
            string encodedValue = System.Web.HttpUtility.UrlEncode(this.Value, System.Text.Encoding.GetEncoding(1250));
            return "&" + encodedName + "=" + encodedValue;
        }

        /// <summary>
        /// Concatenates the string form of all non-null entries and removes the
        /// leading "&amp;", yielding "a=1&amp;b=2" style form data.
        /// </summary>
        /// <param name="list">Entries to join; null elements are skipped.</param>
        /// <returns>The joined form data, or an empty string when nothing was appended.</returns>
        [DebuggerStepThrough]
        public static string FromArrayToString(WebPostEntry[] list) {
            System.Text.StringBuilder joined = new System.Text.StringBuilder();
            foreach (WebPostEntry item in list) {
                if (item != null) {
                    joined.Append(item.ToString());
                }
            }
            string result = joined.ToString();
            return result.StartsWith("&") ? result.Remove(0, 1) : result;
        }
    }
}
private HTTP_Post_Request requestObject = new HTTP_Post_Request();
// Example: submit four form fields and store the response text in HTML.
// RequestPost expects url, VIEWSTATE, EVENTVALIDATION, Method and Referer
// before the form fields. The first two are not evaluated by the class and
// the referer is left empty here.
// NOTE(review): "POST" is assumed from the class name - confirm the intended method.
HTML = requestObject.RequestPost("http://www.meineWebseite.de/Formular1.php",
    "", "", "POST", "",
    new WebPostEntry("Feld1", "true"),
    new WebPostEntry("Feld2", "11"),
    new WebPostEntry("Feld3", "0"),
    new WebPostEntry("Feld4", "ABC")
);
// Find every <INPUT ...> tag in strResult, ignoring case, after removing all
// CR/LF line breaks; group 1 holds the tag's attribute text.
Regex.Matches(strResult.Replace("\r\n", ""), "<INPUT ([^>]*)>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
|
|
|
|
|
Also, ich parse auf der Arbeit einige Webseiten im großen Stil mittels Regex. Regex ist, richtig eingesetzt, mächtiger als DOM. Jedenfalls meiner Meinung nach.
– Floyd 16.09.2009
|
||
| 1 |
@Floyd da ist definitiv zu widersprechen:
Du kannst zwar mit RegExp so ziemlich alles machen, aber bei Rekursionen z.B. wird es ziemlich umständlich. Stell dir einfach mal folgendes Beispiel vor: [code]<span>Außen <span>Mitte <span>Innen1</span> <span>Innen2</span> Mitte </span> Außen</span>[/code] Ich gebe zu, dass der DOM-Overhead für kleine Aufgaben (Überschriften extrahieren oder so) oder bei gut geformten Webseiten erstmal ziemlich abschreckend wirkt. Aber auf Dauer fährst du damit einfach sicherer und sparst dir meiner Ansicht nach viel Mühe. – kleingeist 20.09.2009
|
|
|
Das Problem an DOM ist, dass du im Vergleich mit RegEx unflexibel bist. Regex kann es in vielen Fällen egal sein, ob noch HTML-Elemente nachträglich hinzugefügt wurden. Bei DOM musst du den Parse-Weg dahingehend anpassen.
– Floyd 25.09.2009
|
/// <summary>
/// Downloads <paramref name="URL"/> with a GET request, stores the response
/// text in SrvResponse and writes it to the file <paramref name="PATH"/>.
/// </summary>
/// <param name="URL">Address to request.</param>
/// <param name="REF">Value for the Referer header.</param>
/// <param name="PATH">File that receives the response text.</param>
public void get(string URL, string REF, string PATH)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(URL);
    req.AllowAutoRedirect = true;
    req.Referer = REF;
    req.Timeout = 30000;
    req.ContentType = "application/x-www-form-urlencoded";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    req.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5 (.NET CLR 3.5.30729)";

    // using blocks release the connection and streams even if reading fails.
    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    using (Stream resstream = res.GetResponseStream())
    using (StreamReader reader = new StreamReader(resstream))
    {
        SrvResponse = reader.ReadToEnd();
    }

    // The file is opened only after the download succeeded, so a failed
    // request no longer leaves an empty file behind.
    using (StreamWriter writer = new StreamWriter(PATH))
    {
        writer.Write(SrvResponse);
    }
}
/// <summary>
/// Sends <paramref name="PostData"/> to <paramref name="URL"/> with a POST
/// request, stores the response text in SrvResponse and writes it to the file
/// <paramref name="PATH"/>.
/// </summary>
/// <param name="URL">Address to request.</param>
/// <param name="REF">Value for the Referer header.</param>
/// <param name="PATH">File that receives the response text.</param>
/// <param name="PostData">Form data for the request body; it is converted to bytes with ASCII encoding.</param>
public void post(string URL, string REF, string PATH, string PostData)
{
    byte[] bytes = Encoding.ASCII.GetBytes(PostData);
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(URL);
    req.AllowAutoRedirect = true;
    req.Method = "POST";
    req.Referer = REF;
    req.Timeout = 30000;
    req.ContentType = "application/x-www-form-urlencoded";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    req.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5 (.NET CLR 3.5.30729)";
    req.ContentLength = bytes.Length;

    // Close the request stream once the body is written; it was previously left open.
    using (Stream os = req.GetRequestStream())
    {
        os.Write(bytes, 0, bytes.Length);
    }

    // using blocks release the connection and streams even if reading fails.
    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    using (Stream resstream = res.GetResponseStream())
    using (StreamReader reader = new StreamReader(resstream))
    {
        SrvResponse = reader.ReadToEnd();
    }

    // The file is opened only after the request succeeded, so a failed
    // request no longer leaves an empty file behind.
    using (StreamWriter writer = new StreamWriter(PATH))
    {
        writer.Write(SrvResponse);
    }
}
|
|